@agentv/core 4.14.0-next.1 → 4.15.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -5856,10 +5856,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
5856
5856
  const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
5857
5857
  const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
5858
5858
  const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
5859
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
5859
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0 || Array.isArray(testCaseConfig.turns) && testCaseConfig.turns.length > 0;
5860
5860
  if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
5861
5861
  logError3(
5862
- `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
5862
+ `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions/turns`
5863
5863
  );
5864
5864
  continue;
5865
5865
  }
@@ -5936,6 +5936,14 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
5936
5936
  ) : void 0;
5937
5937
  const onDependencyFailureRaw = asString5(testCaseConfig.on_dependency_failure);
5938
5938
  const onDependencyFailure = onDependencyFailureRaw === "skip" || onDependencyFailureRaw === "fail" || onDependencyFailureRaw === "run" ? onDependencyFailureRaw : void 0;
5939
+ const modeRaw = asString5(testCaseConfig.mode);
5940
+ const mode = modeRaw === "conversation" ? "conversation" : void 0;
5941
+ const turns = Array.isArray(testCaseConfig.turns) ? parseTurns(testCaseConfig.turns) : void 0;
5942
+ const aggregationRaw = asString5(testCaseConfig.aggregation);
5943
+ const aggregation = aggregationRaw === "mean" || aggregationRaw === "min" || aggregationRaw === "max" ? aggregationRaw : void 0;
5944
+ const onTurnFailureRaw = asString5(testCaseConfig.on_turn_failure);
5945
+ const onTurnFailure = onTurnFailureRaw === "continue" || onTurnFailureRaw === "stop" ? onTurnFailureRaw : void 0;
5946
+ const windowSize = typeof testCaseConfig.window_size === "number" && testCaseConfig.window_size >= 1 ? testCaseConfig.window_size : void 0;
5939
5947
  const testCase = {
5940
5948
  id,
5941
5949
  suite: suiteName,
@@ -5954,6 +5962,11 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
5954
5962
  metadata,
5955
5963
  targets: caseTargets,
5956
5964
  ...caseThreshold !== void 0 ? { threshold: caseThreshold } : {},
5965
+ ...mode ? { mode } : {},
5966
+ ...turns && turns.length > 0 ? { turns } : {},
5967
+ ...aggregation ? { aggregation } : {},
5968
+ ...onTurnFailure ? { on_turn_failure: onTurnFailure } : {},
5969
+ ...windowSize !== void 0 ? { window_size: windowSize } : {},
5957
5970
  ...dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {},
5958
5971
  ...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}
5959
5972
  };
@@ -5971,6 +5984,25 @@ async function loadTestById(evalFilePath, repoRoot, evalId) {
5971
5984
  return match;
5972
5985
  }
5973
5986
  var loadEvalCaseById = loadTestById;
/**
 * Normalizes the raw `turns` array of a test case.
 *
 * Every raw entry produces exactly one output entry, in the same order, holding
 * `input` plus `expected_output` (when defined) and `assertions` (when it is a
 * non-empty array). Field values are passed through without validation.
 *
 * @param {unknown[]} rawTurns - Raw turn entries as read from the eval file.
 * @returns {object[]} The normalized turns.
 */
function parseTurns(rawTurns) {
  return rawTurns.map((rawTurn) => {
    // A null/undefined entry used to throw on property access and abort the
    // whole load; treat any non-object entry as a turn with no fields instead.
    const turn = rawTurn !== null && typeof rawTurn === "object" ? rawTurn : {};
    const { input, expected_output: expectedOutput } = turn;
    // Shallow copy so later changes to the parsed config array do not leak in.
    const assertions = Array.isArray(turn.assertions) ? turn.assertions.slice() : void 0;
    return {
      input,
      ...expectedOutput !== void 0 ? { expected_output: expectedOutput } : {},
      ...assertions && assertions.length > 0 ? { assertions } : {}
    };
  });
}
5974
6006
  function parseCommandArray(source) {
5975
6007
  if (typeof source === "string") {
5976
6008
  const parts = source.trim().split(/\s+/);
@@ -7053,6 +7085,155 @@ function subscribeToClaudeLogEntries(listener) {
7053
7085
  };
7054
7086
  }
7055
7087
 
7088
+ // src/evaluation/providers/normalize-tool-call.ts
7089
+ init_cjs_shims();
// Exact-match lookup from "<providerKind>::<nativeToolName>" to the canonical
// tool name ("Skill", "Read", "Write", "Edit", "Bash").
// Built from one alias list per provider family so that providers sharing a
// vocabulary cannot drift apart; entry order is family -> provider -> alias.
var TOOL_NAME_MAP = /* @__PURE__ */ (() => {
  const families = [
    {
      // --- Claude (already canonical) ---
      kinds: ["claude", "claude-cli", "claude-sdk"],
      aliases: [
        ["Skill", "Skill"],
        ["Read", "Read"],
        ["Write", "Write"],
        ["Edit", "Edit"],
        ["Bash", "Bash"]
      ]
    },
    {
      // --- Copilot ---
      kinds: ["copilot-cli", "copilot-sdk", "copilot-log", "vscode", "vscode-insiders"],
      aliases: [
        ["Skill", "Skill"],
        ["skill", "Skill"],
        ["Read File", "Read"],
        ["readFile", "Read"],
        ["Read", "Read"],
        ["readTextFile", "Read"],
        ["writeTextFile", "Write"],
        ["Write File", "Write"],
        ["editFile", "Edit"],
        ["Edit File", "Edit"],
        ["runTerminalCommand", "Bash"]
      ]
    },
    {
      // --- Codex ---
      kinds: ["codex"],
      aliases: [
        ["command_execution", "Bash"],
        ["file_change", "Edit"]
      ]
    },
    {
      // --- Pi ---
      kinds: ["pi-coding-agent", "pi-cli"],
      aliases: [
        ["read", "Read"],
        ["bash", "Bash"]
      ]
    }
  ];
  const entries = [];
  for (const { kinds, aliases } of families) {
    for (const kind of kinds) {
      for (const [nativeName, canonical] of aliases) {
        entries.push([`${kind}::${nativeName}`, canonical]);
      }
    }
  }
  return new Map(entries);
})();
// Name-prefix rules, keyed by provider kind. Each rule names the prefix to
// look for, the canonical tool it maps to, and optionally flags that the text
// after the prefix carries the skill name.
var COPILOT_PREFIXES = [
  { prefix: "Using skill: ", canonical: "Skill", extractSkillFromName: true },
  { prefix: "Viewing ", canonical: "Read" }
];
var CODEX_PREFIXES = [{ prefix: "mcp:", canonical: "Skill", extractSkillFromName: true }];
// All Copilot-family providers share the very same rule array instance.
var TOOL_PREFIX_MAP = /* @__PURE__ */ new Map([
  ...["copilot-cli", "copilot-sdk", "copilot-log", "vscode", "vscode-insiders"].map(
    (providerKind) => [providerKind, COPILOT_PREFIXES]
  ),
  ["codex", CODEX_PREFIXES]
]);
/**
 * Input normalizer for the canonical "Skill" tool.
 *
 * Skill inputs need no field renaming, so the input is returned as-is (the
 * previous conditional returned `input` on both branches).
 *
 * @param {object} input - Tool-call input.
 * @returns {object} The same `input` reference.
 */
var normalizeSkillInput = (input) => input;
/**
 * Input normalizer for the canonical "Read" tool: ensures the file path is
 * available under `file_path`.
 *
 * When `file_path` is missing (undefined or null), it is filled from `path`,
 * falling back to `filePath`. The original alias keys are kept. The input is
 * never mutated; the same reference is returned when nothing needs to change.
 *
 * @param {object} input - Tool-call input.
 * @returns {object} `input` itself, or a shallow copy with `file_path` added.
 */
var normalizeReadInput = (input) => {
  // `!= null` deliberately treats null like undefined so that a null
  // placeholder does not hide a usable alias.
  if (input.file_path != null) return input;
  const aliasedPath = input.path ?? input.filePath;
  return aliasedPath != null ? { ...input, file_path: aliasedPath } : input;
};
// Canonical tool name -> function that reshapes that tool's input.
// Canonical tools without an entry here have their input left untouched.
var INPUT_NORMALIZERS = /* @__PURE__ */ new Map(
  Object.entries({
    Skill: normalizeSkillInput,
    Read: normalizeReadInput
  })
);
/**
 * Rewrites a provider-native tool call to the canonical tool vocabulary.
 *
 * Resolution order:
 *   1. exact lookup of "<providerKind>::<tool>" in TOOL_NAME_MAP;
 *   2. the first matching name-prefix rule registered for the provider in
 *      TOOL_PREFIX_MAP; for rules flagged `extractSkillFromName`, a non-empty
 *      remainder of the name is written to `input.skill` (replacing any
 *      existing `skill` value);
 *   3. otherwise the tool call is returned unchanged (same reference).
 *
 * The incoming tool call is never mutated.
 *
 * @param {string} providerKind - Provider identifier, e.g. "codex".
 * @param {object} tc - Tool call with at least a `tool` name and optional `input`.
 * @returns {object} The normalized tool call, or `tc` itself when no rule applies.
 */
function normalizeToolCall(providerKind, tc) {
  const nativeName = tc.tool;
  // Normalization must never make tool-call recording fail: a missing or
  // non-string name cannot match any rule, so hand it back untouched rather
  // than throwing on `startsWith` below.
  if (typeof nativeName !== "string") return tc;
  const canonical = TOOL_NAME_MAP.get(`${providerKind}::${nativeName}`);
  if (canonical) {
    return applyInputNormalization(canonical, { ...tc, tool: canonical });
  }
  const rule = TOOL_PREFIX_MAP.get(providerKind)?.find(
    (candidate) => nativeName.startsWith(candidate.prefix)
  );
  if (!rule) return tc;
  const suffix = nativeName.slice(rule.prefix.length);
  let normalizedInput = tc.input;
  if (rule.extractSkillFromName && suffix) {
    normalizedInput = { ...tc.input ?? {}, skill: suffix };
  }
  // NOTE(review): for rules without `extractSkillFromName` the text after the
  // prefix is dropped here - confirm no consumer needs it.
  return applyInputNormalization(rule.canonical, {
    ...tc,
    tool: rule.canonical,
    input: normalizedInput
  });
}
/**
 * Runs the input normalizer registered for `canonical` (if any) over the tool
 * call's input.
 *
 * @param {string} canonical - Canonical tool name used to look up the normalizer.
 * @param {object} tc - Tool call whose `input` may be reshaped.
 * @returns {object} `tc` itself when there is no input, no normalizer, or the
 *   normalizer returned the same input reference; otherwise a shallow copy of
 *   `tc` carrying the normalized input.
 */
function applyInputNormalization(canonical, tc) {
  const rawInput = tc.input;
  if (rawInput === void 0 || rawInput === null) {
    return tc;
  }
  const normalize = INPUT_NORMALIZERS.get(canonical);
  if (!normalize) {
    return tc;
  }
  const reshaped = normalize(rawInput);
  if (reshaped === rawInput) {
    return tc;
  }
  return { ...tc, input: reshaped };
}
7236
+
7056
7237
  // src/evaluation/providers/preread.ts
7057
7238
  init_cjs_shims();
7058
7239
  var import_node_path12 = __toESM(require("path"), 1);
@@ -7521,11 +7702,13 @@ function extractToolCalls(content) {
7521
7702
  }
7522
7703
  const p = part;
7523
7704
  if (p.type === "tool_use" && typeof p.name === "string") {
7524
- toolCalls.push({
7525
- tool: p.name,
7526
- input: p.input,
7527
- id: typeof p.id === "string" ? p.id : void 0
7528
- });
7705
+ toolCalls.push(
7706
+ normalizeToolCall("claude-cli", {
7707
+ tool: p.name,
7708
+ input: p.input,
7709
+ id: typeof p.id === "string" ? p.id : void 0
7710
+ })
7711
+ );
7529
7712
  }
7530
7713
  }
7531
7714
  return toolCalls;
@@ -7817,11 +8000,13 @@ function extractToolCalls2(content) {
7817
8000
  }
7818
8001
  const p = part;
7819
8002
  if (p.type === "tool_use" && typeof p.name === "string") {
7820
- toolCalls.push({
7821
- tool: p.name,
7822
- input: p.input,
7823
- id: typeof p.id === "string" ? p.id : void 0
7824
- });
8003
+ toolCalls.push(
8004
+ normalizeToolCall("claude-sdk", {
8005
+ tool: p.name,
8006
+ input: p.input,
8007
+ id: typeof p.id === "string" ? p.id : void 0
8008
+ })
8009
+ );
7825
8010
  }
7826
8011
  }
7827
8012
  return toolCalls;
@@ -8739,27 +8924,33 @@ ${basePrompt}` : basePrompt;
8739
8924
  }
8740
8925
  }
8741
8926
  if (itemType === "command_execution") {
8742
- completedToolCalls.push({
8743
- tool: "command_execution",
8744
- input: { command: item.command },
8745
- output: item.aggregated_output,
8746
- id: item.id
8747
- });
8927
+ completedToolCalls.push(
8928
+ normalizeToolCall("codex", {
8929
+ tool: "command_execution",
8930
+ input: { command: item.command },
8931
+ output: item.aggregated_output,
8932
+ id: item.id
8933
+ })
8934
+ );
8748
8935
  }
8749
8936
  if (itemType === "file_change") {
8750
- completedToolCalls.push({
8751
- tool: "file_change",
8752
- input: item.changes,
8753
- id: item.id
8754
- });
8937
+ completedToolCalls.push(
8938
+ normalizeToolCall("codex", {
8939
+ tool: "file_change",
8940
+ input: item.changes,
8941
+ id: item.id
8942
+ })
8943
+ );
8755
8944
  }
8756
8945
  if (itemType === "mcp_tool_call") {
8757
- completedToolCalls.push({
8758
- tool: `mcp:${item.server}/${item.tool}`,
8759
- input: item.arguments,
8760
- output: item.result ?? item.error,
8761
- id: item.id
8762
- });
8946
+ completedToolCalls.push(
8947
+ normalizeToolCall("codex", {
8948
+ tool: `mcp:${item.server}/${item.tool}`,
8949
+ input: item.arguments,
8950
+ output: item.result ?? item.error,
8951
+ id: item.id
8952
+ })
8953
+ );
8763
8954
  }
8764
8955
  }
8765
8956
  resolveCwd(cwdOverride) {
@@ -9299,12 +9490,6 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
9299
9490
  return logger;
9300
9491
  }
9301
9492
  handleEvent(eventType, data) {
9302
- if (this.format === "json") {
9303
- const elapsed2 = formatElapsed4(this.startedAt);
9304
- this.stream.write(`${JSON.stringify({ time: elapsed2, event: eventType, data })}
9305
- `);
9306
- return;
9307
- }
9308
9493
  if (this.chunkExtractor) {
9309
9494
  const chunkText = this.chunkExtractor(eventType, data);
9310
9495
  if (chunkText === null) {
@@ -9317,6 +9502,12 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
9317
9502
  }
9318
9503
  this.flushPendingText();
9319
9504
  }
9505
+ if (this.format === "json") {
9506
+ const elapsed2 = formatElapsed4(this.startedAt);
9507
+ this.stream.write(`${JSON.stringify({ time: elapsed2, event: eventType, data })}
9508
+ `);
9509
+ return;
9510
+ }
9320
9511
  const elapsed = formatElapsed4(this.startedAt);
9321
9512
  const summary = this.summarize(eventType, data);
9322
9513
  if (summary) {
@@ -9327,14 +9518,19 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
9327
9518
  flushPendingText() {
9328
9519
  if (!this.pendingText) return;
9329
9520
  const elapsed = formatElapsed4(this.startedAt);
9330
- this.stream.write(`[+${elapsed}] [assistant_message] ${this.pendingText}
9521
+ if (this.format === "json") {
9522
+ this.stream.write(
9523
+ `${JSON.stringify({ time: elapsed, event: "assistant_message", data: { content: this.pendingText } })}
9524
+ `
9525
+ );
9526
+ } else {
9527
+ this.stream.write(`[+${elapsed}] [assistant_message] ${this.pendingText}
9331
9528
  `);
9529
+ }
9332
9530
  this.pendingText = "";
9333
9531
  }
9334
9532
  async close() {
9335
- if (this.format !== "json") {
9336
- this.flushPendingText();
9337
- }
9533
+ this.flushPendingText();
9338
9534
  await new Promise((resolve, reject) => {
9339
9535
  this.stream.once("error", reject);
9340
9536
  this.stream.end(() => resolve());
@@ -9409,15 +9605,17 @@ var CopilotCliProvider = class {
9409
9605
  }
9410
9606
  if (update.status === "completed" || update.status === "failed") {
9411
9607
  const toolName = update.title ?? update.kind ?? "unknown";
9412
- completedToolCalls.push({
9413
- tool: toolName,
9414
- input: update.rawInput,
9415
- output: update.rawOutput,
9416
- id: callId,
9417
- startTime: (/* @__PURE__ */ new Date()).toISOString(),
9418
- endTime: (/* @__PURE__ */ new Date()).toISOString(),
9419
- durationMs: 0
9420
- });
9608
+ completedToolCalls.push(
9609
+ normalizeToolCall("copilot-cli", {
9610
+ tool: toolName,
9611
+ input: update.rawInput,
9612
+ output: update.rawOutput,
9613
+ id: callId,
9614
+ startTime: (/* @__PURE__ */ new Date()).toISOString(),
9615
+ endTime: (/* @__PURE__ */ new Date()).toISOString(),
9616
+ durationMs: 0
9617
+ })
9618
+ );
9421
9619
  request.streamCallbacks?.onToolCallEnd?.(
9422
9620
  toolName,
9423
9621
  update.rawInput,
@@ -9434,15 +9632,17 @@ var CopilotCliProvider = class {
9434
9632
  if (inProgress) {
9435
9633
  toolCallsInProgress.delete(callId);
9436
9634
  const duration = Date.now() - inProgress.startMs;
9437
- completedToolCalls.push({
9438
- tool: inProgress.tool,
9439
- input: inProgress.input,
9440
- output: update.rawOutput,
9441
- id: inProgress.id,
9442
- startTime: inProgress.startTime,
9443
- endTime: (/* @__PURE__ */ new Date()).toISOString(),
9444
- durationMs: duration
9445
- });
9635
+ completedToolCalls.push(
9636
+ normalizeToolCall("copilot-cli", {
9637
+ tool: inProgress.tool,
9638
+ input: inProgress.input,
9639
+ output: update.rawOutput,
9640
+ id: inProgress.id,
9641
+ startTime: inProgress.startTime,
9642
+ endTime: (/* @__PURE__ */ new Date()).toISOString(),
9643
+ durationMs: duration
9644
+ })
9645
+ );
9446
9646
  request.streamCallbacks?.onToolCallEnd?.(
9447
9647
  inProgress.tool,
9448
9648
  inProgress.input,
@@ -9788,11 +9988,13 @@ function parseCopilotEvents(eventsJsonl) {
9788
9988
  }
9789
9989
  case "assistant.message": {
9790
9990
  const toolRequests = data.toolRequests;
9791
- const toolCalls = (toolRequests ?? []).map((req) => ({
9792
- tool: String(req.name ?? req.toolName ?? ""),
9793
- input: req.arguments,
9794
- id: req.toolCallId ? String(req.toolCallId) : void 0
9795
- }));
9991
+ const toolCalls = (toolRequests ?? []).map(
9992
+ (req) => normalizeToolCall("copilot-log", {
9993
+ tool: String(req.name ?? req.toolName ?? ""),
9994
+ input: req.arguments,
9995
+ id: req.toolCallId ? String(req.toolCallId) : void 0
9996
+ })
9997
+ );
9796
9998
  messages.push({
9797
9999
  role: "assistant",
9798
10000
  content: data.content != null ? String(data.content) : void 0,
@@ -9832,12 +10034,12 @@ function parseCopilotEvents(eventsJsonl) {
9832
10034
  messages.push({
9833
10035
  role: "assistant",
9834
10036
  toolCalls: [
9835
- {
10037
+ normalizeToolCall("copilot-log", {
9836
10038
  tool: started.toolName,
9837
10039
  input: started.input,
9838
10040
  output: data.result,
9839
10041
  id: toolCallId
9840
- }
10042
+ })
9841
10043
  ]
9842
10044
  });
9843
10045
  }
@@ -10186,15 +10388,17 @@ var CopilotSdkProvider = class {
10186
10388
  if (inProgress) {
10187
10389
  toolCallsInProgress.delete(callId);
10188
10390
  const endMs = Date.now();
10189
- completedToolCalls.push({
10190
- tool: inProgress.tool,
10191
- input: inProgress.input,
10192
- output: data?.output ?? data?.result,
10193
- id: inProgress.id,
10194
- startTime: inProgress.startTime,
10195
- endTime: (/* @__PURE__ */ new Date()).toISOString(),
10196
- durationMs: endMs - inProgress.startMs
10197
- });
10391
+ completedToolCalls.push(
10392
+ normalizeToolCall("copilot-sdk", {
10393
+ tool: inProgress.tool,
10394
+ input: inProgress.input,
10395
+ output: data?.output ?? data?.result,
10396
+ id: inProgress.id,
10397
+ startTime: inProgress.startTime,
10398
+ endTime: (/* @__PURE__ */ new Date()).toISOString(),
10399
+ durationMs: endMs - inProgress.startMs
10400
+ })
10401
+ );
10198
10402
  }
10199
10403
  }
10200
10404
  if (eventType === "assistant.message") {
@@ -11178,12 +11382,14 @@ function extractToolCallsFromEvents(events) {
11178
11382
  }
11179
11383
  const toolCalls = [];
11180
11384
  for (const [id, { tool: tool2, input }] of starts) {
11181
- toolCalls.push({
11182
- tool: tool2,
11183
- input,
11184
- id: id.startsWith("anon-") ? void 0 : id,
11185
- output: results.get(id)
11186
- });
11385
+ toolCalls.push(
11386
+ normalizeToolCall("pi-cli", {
11387
+ tool: tool2,
11388
+ input,
11389
+ id: id.startsWith("anon-") ? void 0 : id,
11390
+ output: results.get(id)
11391
+ })
11392
+ );
11187
11393
  }
11188
11394
  return toolCalls;
11189
11395
  }
@@ -11305,17 +11511,21 @@ function extractToolCalls3(content) {
11305
11511
  if (!part || typeof part !== "object") continue;
11306
11512
  const p = part;
11307
11513
  if (p.type === "tool_use" && typeof p.name === "string") {
11308
- toolCalls.push({
11309
- tool: p.name,
11310
- input: p.input,
11311
- id: typeof p.id === "string" ? p.id : void 0
11312
- });
11514
+ toolCalls.push(
11515
+ normalizeToolCall("pi-cli", {
11516
+ tool: p.name,
11517
+ input: p.input,
11518
+ id: typeof p.id === "string" ? p.id : void 0
11519
+ })
11520
+ );
11313
11521
  } else if ((p.type === "toolCall" || p.type === "tool_call") && typeof p.name === "string") {
11314
- toolCalls.push({
11315
- tool: p.name,
11316
- input: p.arguments ?? p.input,
11317
- id: typeof p.id === "string" ? p.id : void 0
11318
- });
11522
+ toolCalls.push(
11523
+ normalizeToolCall("pi-cli", {
11524
+ tool: p.name,
11525
+ input: p.arguments ?? p.input,
11526
+ id: typeof p.id === "string" ? p.id : void 0
11527
+ })
11528
+ );
11319
11529
  } else if (p.type === "tool_result" && typeof p.tool_use_id === "string") {
11320
11530
  const existing = toolCalls.find((tc) => tc.id === p.tool_use_id);
11321
11531
  if (existing) {
@@ -12776,6 +12986,11 @@ function resolveCodexConfig(target, env, evalFilePath) {
12776
12986
  const logDirSource = target.log_dir ?? target.log_directory;
12777
12987
  const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CODEX_LOG_FORMAT;
12778
12988
  const systemPromptSource = target.system_prompt;
12989
+ const streamLogResult = resolveStreamLog(target, env.AGENTV_CODEX_LOG_FORMAT);
12990
+ if (streamLogResult.deprecationWarning) {
12991
+ process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
12992
+ `);
12993
+ }
12779
12994
  const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, {
12780
12995
  allowLiteral: true,
12781
12996
  optionalEnv: true
@@ -12822,6 +13037,7 @@ function resolveCodexConfig(target, env, evalFilePath) {
12822
13037
  timeoutMs,
12823
13038
  logDir,
12824
13039
  logFormat,
13040
+ streamLog: streamLogResult.streamLog,
12825
13041
  systemPrompt
12826
13042
  };
12827
13043
  }
@@ -12838,6 +13054,38 @@ function normalizeCodexLogFormat(value) {
12838
13054
  }
12839
13055
  throw new Error("codex log format must be 'summary' or 'json'");
12840
13056
  }
/**
 * Resolves a target's stream-log setting, honoring the legacy log-format
 * settings as a deprecated fallback.
 *
 * Precedence:
 *   1. `target.stream_log` when it is neither undefined nor null: `false` /
 *      "false" disables logging, "raw" maps to log format "json", "summary"
 *      maps to "summary". String values are matched after trimming and
 *      lower-casing, the same way the legacy value is handled below.
 *   2. `target.log_format`, then `target.log_output_format`, then
 *      `envFallback`: "json" maps to stream log "raw", "summary" to
 *      "summary", and the result carries a `deprecationWarning` naming the
 *      setting the value actually came from.
 *   3. Nothing configured: both fields are undefined.
 *
 * @param {object} target - Target definition; `name` is used in messages.
 * @param {string} [envFallback] - Legacy log format taken from the environment.
 * @returns {{streamLog: (false|"raw"|"summary"|undefined),
 *            logFormat: ("json"|"summary"|undefined),
 *            deprecationWarning?: string}}
 * @throws {Error} When `stream_log` or the legacy log format has an unsupported value.
 */
function resolveStreamLog(target, envFallback) {
  const explicit = target.stream_log;
  if (explicit !== void 0 && explicit !== null) {
    const val = typeof explicit === "string" ? explicit.trim().toLowerCase() : explicit;
    if (val === false || val === "false") {
      return { streamLog: false, logFormat: void 0 };
    }
    if (val === "raw") {
      return { streamLog: "raw", logFormat: "json" };
    }
    if (val === "summary") {
      return { streamLog: "summary", logFormat: "summary" };
    }
    throw new Error(`${target.name}: stream_log must be false, 'raw', or 'summary'`);
  }
  const logFormatRaw = target.log_format ?? target.log_output_format ?? envFallback;
  if (logFormatRaw === void 0 || logFormatRaw === null) {
    return { streamLog: void 0, logFormat: void 0 };
  }
  if (typeof logFormatRaw !== "string") {
    throw new Error(`${target.name}: log_format must be 'summary' or 'json'`);
  }
  const normalized = logFormatRaw.trim().toLowerCase();
  if (normalized !== "json" && normalized !== "summary") {
    throw new Error(`${target.name}: log_format must be 'summary' or 'json'`);
  }
  const streamLogEquivalent = normalized === "json" ? "raw" : "summary";
  // Mirror the `??` chain above so the warning points at the setting the user
  // really has to change instead of always blaming 'log_format'.
  let legacyKey;
  if (target.log_format !== void 0 && target.log_format !== null) {
    legacyKey = "log_format";
  } else if (target.log_output_format !== void 0 && target.log_output_format !== null) {
    legacyKey = "log_output_format";
  }
  const legacyLabel = legacyKey ? `'${legacyKey}'` : "the log format environment variable";
  const legacyExample = legacyKey ?? "env";
  return {
    streamLog: streamLogEquivalent,
    logFormat: normalized,
    deprecationWarning: `${target.name}: ${legacyLabel} is deprecated and will be removed in v4.16. Use 'stream_log: ${streamLogEquivalent}' instead (${legacyExample}: '${normalized}' \u2192 stream_log: '${streamLogEquivalent}').`
  };
}
12841
13089
  function resolveCopilotSdkConfig(target, env, evalFilePath) {
12842
13090
  const cliUrlSource = target.cli_url;
12843
13091
  const cliPathSource = target.cli_path;
@@ -12849,6 +13097,11 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
12849
13097
  const logDirSource = target.log_dir ?? target.log_directory;
12850
13098
  const logFormatSource = target.log_format;
12851
13099
  const systemPromptSource = target.system_prompt;
13100
+ const streamLogResult = resolveStreamLog(target);
13101
+ if (streamLogResult.deprecationWarning) {
13102
+ process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
13103
+ `);
13104
+ }
12852
13105
  const cliUrl = resolveOptionalString(cliUrlSource, env, `${target.name} copilot-sdk cli URL`, {
12853
13106
  allowLiteral: true,
12854
13107
  optionalEnv: true
@@ -12959,6 +13212,7 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
12959
13212
  timeoutMs,
12960
13213
  logDir,
12961
13214
  logFormat,
13215
+ streamLog: streamLogResult.streamLog,
12962
13216
  systemPrompt,
12963
13217
  byokType,
12964
13218
  byokBaseUrl,
@@ -12978,6 +13232,11 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
12978
13232
  const logDirSource = target.log_dir ?? target.log_directory;
12979
13233
  const logFormatSource = target.log_format;
12980
13234
  const systemPromptSource = target.system_prompt;
13235
+ const streamLogResult = resolveStreamLog(target);
13236
+ if (streamLogResult.deprecationWarning) {
13237
+ process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
13238
+ `);
13239
+ }
12981
13240
  const executable = resolveOptionalString(executableSource, env, `${target.name} copilot-cli executable`, {
12982
13241
  allowLiteral: true,
12983
13242
  optionalEnv: true
@@ -13029,6 +13288,7 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
13029
13288
  timeoutMs,
13030
13289
  logDir,
13031
13290
  logFormat,
13291
+ streamLog: streamLogResult.streamLog,
13032
13292
  systemPrompt
13033
13293
  };
13034
13294
  }
@@ -13051,6 +13311,11 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
13051
13311
  const logDirSource = target.log_dir ?? target.log_directory;
13052
13312
  const logFormatSource = target.log_format;
13053
13313
  const systemPromptSource = target.system_prompt;
13314
+ const streamLogResult = resolveStreamLog(target);
13315
+ if (streamLogResult.deprecationWarning) {
13316
+ process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
13317
+ `);
13318
+ }
13054
13319
  const subprovider = resolveOptionalString(
13055
13320
  subproviderSource,
13056
13321
  env,
@@ -13121,6 +13386,7 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
13121
13386
  timeoutMs,
13122
13387
  logDir,
13123
13388
  logFormat,
13389
+ streamLog: streamLogResult.streamLog,
13124
13390
  systemPrompt
13125
13391
  };
13126
13392
  }
@@ -13137,6 +13403,11 @@ function resolvePiCliConfig(target, env, evalFilePath) {
13137
13403
  const logDirSource = target.log_dir ?? target.log_directory;
13138
13404
  const logFormatSource = target.log_format;
13139
13405
  const systemPromptSource = target.system_prompt;
13406
+ const streamLogResult = resolveStreamLog(target);
13407
+ if (streamLogResult.deprecationWarning) {
13408
+ process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
13409
+ `);
13410
+ }
13140
13411
  const executable = resolveOptionalString(executableSource, env, `${target.name} pi-cli executable`, {
13141
13412
  allowLiteral: true,
13142
13413
  optionalEnv: true
@@ -13207,6 +13478,7 @@ function resolvePiCliConfig(target, env, evalFilePath) {
13207
13478
  timeoutMs,
13208
13479
  logDir,
13209
13480
  logFormat,
13481
+ streamLog: streamLogResult.streamLog,
13210
13482
  systemPrompt
13211
13483
  };
13212
13484
  }
@@ -13218,6 +13490,11 @@ function resolveClaudeConfig(target, env, evalFilePath) {
13218
13490
  const logDirSource = target.log_dir ?? target.log_directory;
13219
13491
  const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CLAUDE_LOG_FORMAT;
13220
13492
  const systemPromptSource = target.system_prompt;
13493
+ const streamLogResult = resolveStreamLog(target);
13494
+ if (streamLogResult.deprecationWarning) {
13495
+ process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
13496
+ `);
13497
+ }
13221
13498
  const model = resolveOptionalString(modelSource, env, `${target.name} claude model`, {
13222
13499
  allowLiteral: true,
13223
13500
  optionalEnv: true
@@ -13261,7 +13538,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
13261
13538
  maxTurns,
13262
13539
  maxBudgetUsd,
13263
13540
  logDir,
13264
- logFormat
13541
+ logFormat,
13542
+ streamLog: streamLogResult.streamLog
13265
13543
  };
13266
13544
  }
13267
13545
  function normalizeClaudeLogFormat(value) {
@@ -17946,100 +18224,35 @@ var LatencyEvaluator = class {
17946
18224
 
17947
18225
  // src/evaluation/evaluators/skill-trigger.ts
17948
18226
  init_cjs_shims();
17949
- var CLAUDE_MATCHER = {
17950
- skillTools: ["Skill"],
17951
- skillInputField: "skill",
17952
- readTools: ["Read"],
17953
- readInputField: "file_path"
17954
- };
17955
- var COPILOT_MATCHER = {
17956
- skillTools: ["Skill", "skill"],
17957
- skillInputField: "skill",
17958
- readTools: ["Read File", "readFile", "Read", "readTextFile"],
17959
- readInputField: "file_path",
17960
- skillToolPrefixes: ["Using skill: "],
17961
- readToolPrefixes: ["Viewing "],
17962
- readInputFields: ["file_path", "path"]
17963
- };
17964
- var PI_CODING_AGENT_MATCHER = {
17965
- skillTools: [],
17966
- skillInputField: "skill",
17967
- readTools: ["read"],
17968
- readInputField: "path",
17969
- readInputFields: ["path", "file_path", "filePath"]
17970
- };
17971
- var CODEX_MATCHER = {
17972
- skillTools: [],
17973
- skillInputField: "skill",
17974
- readTools: ["command_execution"],
17975
- readInputField: "command",
17976
- skillToolPrefixes: ["mcp:"],
17977
- readToolPrefixes: ["mcp:"],
17978
- readInputFields: ["command", "path", "file_path", "filePath"]
17979
- };
17980
- var PROVIDER_TOOL_SEMANTICS = {
17981
- claude: CLAUDE_MATCHER,
17982
- "claude-cli": CLAUDE_MATCHER,
17983
- "claude-sdk": CLAUDE_MATCHER,
17984
- codex: CODEX_MATCHER,
17985
- "pi-coding-agent": PI_CODING_AGENT_MATCHER,
17986
- "pi-cli": PI_CODING_AGENT_MATCHER,
17987
- "copilot-cli": COPILOT_MATCHER,
17988
- "copilot-log": COPILOT_MATCHER,
17989
- "copilot-sdk": COPILOT_MATCHER,
17990
- vscode: COPILOT_MATCHER,
17991
- "vscode-insiders": COPILOT_MATCHER
17992
- };
17993
18227
  var SkillTriggerEvaluator = class {
17994
18228
  kind = "skill-trigger";
17995
18229
  config;
17996
18230
  constructor(config) {
17997
18231
  this.config = config;
17998
18232
  }
17999
- resolveMatcher(providerKind) {
18000
- if (providerKind) {
18001
- const match = PROVIDER_TOOL_SEMANTICS[providerKind];
18002
- if (match) return match;
18003
- }
18004
- return CLAUDE_MATCHER;
18005
- }
18006
18233
  evaluate(context2) {
18007
18234
  const skillName = this.config.skill;
18008
18235
  const shouldTrigger = this.config.should_trigger !== false;
18009
- const providerKind = context2.provider?.kind;
18010
- const matcher = this.resolveMatcher(providerKind);
18011
18236
  const allToolCalls = (context2.output ?? []).flatMap((msg) => msg.toolCalls ?? []);
18012
18237
  let triggered = false;
18013
18238
  let evidence = "";
18014
18239
  for (const toolCall of allToolCalls) {
18015
18240
  const toolName = toolCall.tool ?? "";
18016
18241
  const input = toolCall.input ?? {};
18017
- if (matcher.skillTools.includes(toolName)) {
18018
- const skillArg = String(input[matcher.skillInputField] ?? "");
18242
+ if (toolName === "Skill") {
18243
+ const skillArg = String(input.skill ?? "");
18019
18244
  if (skillArg.includes(skillName)) {
18020
18245
  triggered = true;
18021
- evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
18246
+ evidence = `Skill tool invoked with skill="${skillArg}"`;
18022
18247
  break;
18023
18248
  }
18024
- } else if (matcher.skillToolPrefixes?.some(
18025
- (prefix) => toolName.startsWith(prefix) && toolName.includes(skillName)
18026
- )) {
18027
- triggered = true;
18028
- evidence = `Skill tool invoked via tool name "${toolName}"`;
18029
- break;
18030
- } else if (matcher.readTools.includes(toolName)) {
18031
- const filePath = this.readPathFromInput(input, matcher);
18032
- if (filePath.includes(skillName)) {
18249
+ } else if (toolName === "Read") {
18250
+ const filePath = String(input.file_path ?? "");
18251
+ if (filePath.includes(`skills/${skillName}/`)) {
18033
18252
  triggered = true;
18034
18253
  evidence = `Read tool loaded skill file: ${filePath}`;
18035
18254
  break;
18036
18255
  }
18037
- } else if (matcher.readToolPrefixes?.some(
18038
- (prefix) => toolName.startsWith(prefix) && toolName.includes(skillName)
18039
- )) {
18040
- triggered = true;
18041
- evidence = `Read tool loaded skill file via tool name "${toolName}"`;
18042
- break;
18043
18256
  }
18044
18257
  if (!triggered && toolCall.output != null) {
18045
18258
  const outputStr = typeof toolCall.output === "string" ? toolCall.output : JSON.stringify(toolCall.output);
@@ -18076,16 +18289,6 @@ var SkillTriggerEvaluator = class {
18076
18289
  expectedAspectCount: 1
18077
18290
  };
18078
18291
  }
18079
- readPathFromInput(input, matcher) {
18080
- const fields = matcher.readInputFields ?? [matcher.readInputField];
18081
- for (const field of fields) {
18082
- const value = input[field];
18083
- if (value !== void 0 && value !== null) {
18084
- return String(value);
18085
- }
18086
- }
18087
- return "";
18088
- }
18089
18292
  };
18090
18293
 
18091
18294
  // src/evaluation/evaluators/llm-grader-prompt.ts
@@ -18935,10 +19138,12 @@ function runEqualsAssertion(output, value) {
18935
19138
 
18936
19139
  // src/evaluation/orchestrator.ts
18937
19140
  init_cjs_shims();
19141
+ var import_node_child_process11 = require("child_process");
18938
19142
  var import_node_crypto11 = require("crypto");
18939
19143
  var import_node_fs16 = require("fs");
18940
19144
  var import_promises36 = require("fs/promises");
18941
19145
  var import_node_path49 = __toESM(require("path"), 1);
19146
+ var import_node_util7 = require("util");
18942
19147
  var import_micromatch3 = __toESM(require("micromatch"), 1);
18943
19148
 
18944
19149
  // ../../node_modules/.bun/p-limit@6.2.0/node_modules/p-limit/index.js
@@ -20414,6 +20619,8 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
20414
20619
  }
20415
20620
 
20416
20621
  // src/evaluation/orchestrator.ts
20622
+ var execFileAsync3 = (0, import_node_util7.promisify)(import_node_child_process11.execFile);
20623
+ var WORKSPACE_GIT_TIMEOUT_MS = 3e5;
20417
20624
  function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
20418
20625
  return score >= threshold ? "ok" : "quality_failure";
20419
20626
  }
@@ -20451,6 +20658,35 @@ function hasHookCommand(hook) {
20451
20658
  function hooksEnabled(workspace) {
20452
20659
  return workspace?.hooks?.enabled !== false;
20453
20660
  }
20661
/**
 * Builds the environment used for workspace git commands.
 *
 * Starts from a copy of `process.env` with every `GIT_*` variable removed except
 * `GIT_SSH_COMMAND`, then forces non-interactive settings on top.
 * Note that the forced `GIT_SSH_COMMAND` below always replaces any inherited value,
 * so the inherited one never reaches git.
 *
 * @returns {Record<string, string|undefined>} A fresh environment object; `process.env` is not modified.
 */
function workspaceGitEnv() {
  const inherited = Object.fromEntries(
    Object.entries(process.env).filter(
      ([name]) => !name.startsWith("GIT_") || name === "GIT_SSH_COMMAND"
    )
  );
  return Object.assign(inherited, {
    GIT_TERMINAL_PROMPT: "0",
    GIT_ASKPASS: "",
    GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
  });
}
20675
/**
 * Restores a git-backed workspace directory to a known state.
 *
 * Runs `git reset --hard <ref>` followed by `git clean` inside `workspacePath`.
 * Does nothing when the directory has no `.git` entry.
 *
 * @param {string} workspacePath - Directory to reset; used as the git working directory.
 * @param {string} resetMode - "strict" cleans with `-fdx`; any other value cleans with `-fd`.
 * @param {string} [baselineRef] - Ref to reset to; falls back to "HEAD" when null/undefined.
 * @returns {Promise<boolean>} `false` when no `.git` entry exists, `true` after both commands succeed.
 *   Rejections from either git command propagate to the caller.
 */
async function resetWorkspaceRoot(workspacePath, resetMode, baselineRef) {
  const gitEntry = import_node_path49.default.join(workspacePath, ".git");
  if (!(0, import_node_fs16.existsSync)(gitEntry)) {
    return false;
  }
  const cleanFlag = resetMode === "strict" ? "-fdx" : "-fd";
  // One options object (and one environment snapshot) is shared by both commands.
  const execOptions = {
    cwd: workspacePath,
    timeout: WORKSPACE_GIT_TIMEOUT_MS,
    env: workspaceGitEnv(),
    maxBuffer: 50 * 1024 * 1024
  };
  const gitCommands = [
    ["reset", "--hard", baselineRef ?? "HEAD"],
    ["clean", cleanFlag]
  ];
  // Sequential on purpose: the clean must only run after the reset has completed.
  for (const gitArgs of gitCommands) {
    await execFileAsync3("git", gitArgs, execOptions);
  }
  return true;
}
20454
20690
  function getWorkspaceTemplate(target) {
20455
20691
  const config = target.config;
20456
20692
  if ("workspaceTemplate" in config && typeof config.workspaceTemplate === "string") {
@@ -21712,6 +21948,37 @@ async function runEvalCase(options) {
21712
21948
  }
21713
21949
  }
21714
21950
  }
21951
+ let beforeEachNeedsFreshBaseline = false;
21952
+ if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.before_each?.reset && evalCase.workspace.hooks.before_each.reset !== "none") {
21953
+ try {
21954
+ if (repoManager && evalCase.workspace.repos?.length) {
21955
+ await repoManager.reset(
21956
+ evalCase.workspace.repos,
21957
+ workspacePath,
21958
+ evalCase.workspace.hooks.before_each.reset
21959
+ );
21960
+ } else {
21961
+ await resetWorkspaceRoot(
21962
+ workspacePath,
21963
+ evalCase.workspace.hooks.before_each.reset,
21964
+ sharedBaselineCommit
21965
+ );
21966
+ }
21967
+ } catch (error) {
21968
+ const message = error instanceof Error ? error.message : String(error);
21969
+ return buildErrorResult(
21970
+ evalCase,
21971
+ target.name,
21972
+ nowFn(),
21973
+ new Error(`before_each reset failed: ${message}`),
21974
+ promptInputs,
21975
+ provider,
21976
+ "setup",
21977
+ "script_error",
21978
+ verbose
21979
+ );
21980
+ }
21981
+ }
21715
21982
  const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each;
21716
21983
  if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeEachHook)) {
21717
21984
  const beforeEachHook = caseBeforeEachHook;
@@ -21728,6 +21995,7 @@ async function runEvalCase(options) {
21728
21995
  toScriptConfig(beforeEachHook, "before_each", `test '${evalCase.id}'`),
21729
21996
  scriptContext
21730
21997
  );
21998
+ beforeEachNeedsFreshBaseline = true;
21731
21999
  } catch (error) {
21732
22000
  const message = error instanceof Error ? error.message : String(error);
21733
22001
  return buildErrorResult(
@@ -21743,7 +22011,7 @@ async function runEvalCase(options) {
21743
22011
  );
21744
22012
  }
21745
22013
  }
21746
- let baselineCommit = sharedBaselineCommit;
22014
+ let baselineCommit = beforeEachNeedsFreshBaseline ? void 0 : sharedBaselineCommit;
21747
22015
  if (!baselineCommit && workspacePath) {
21748
22016
  try {
21749
22017
  baselineCommit = await initializeBaseline(workspacePath);
@@ -21754,6 +22022,35 @@ async function runEvalCase(options) {
21754
22022
  }
21755
22023
  }
21756
22024
  }
22025
+ if (evalCase.mode === "conversation" && evalCase.turns?.length) {
22026
+ const conversationResult = await runConversationMode({
22027
+ evalCase,
22028
+ provider,
22029
+ target,
22030
+ evaluators,
22031
+ typeRegistry,
22032
+ graderProvider,
22033
+ promptInputs,
22034
+ nowFn,
22035
+ signal,
22036
+ workspacePath,
22037
+ caseWorkspaceFile: caseWorkspaceFile ?? suiteWorkspaceFile,
22038
+ agentTimeoutMs,
22039
+ streamCallbacks: options.streamCallbacks,
22040
+ verbose,
22041
+ threshold: evalCase.threshold ?? caseThreshold,
22042
+ targetResolver,
22043
+ availableTargets
22044
+ });
22045
+ if (workspacePath && !isSharedWorkspace) {
22046
+ const shouldRetain = conversationResult.executionStatus === "ok" ? retainOnSuccess === "keep" || keepWorkspaces : retainOnFailure === "keep" || !forceCleanup && !keepWorkspaces;
22047
+ if (!shouldRetain) {
22048
+ await cleanupWorkspace(workspacePath).catch(() => {
22049
+ });
22050
+ }
22051
+ }
22052
+ return conversationResult;
22053
+ }
21757
22054
  const caseStartMs = Date.now();
21758
22055
  const attemptBudget = (maxRetries ?? 0) + 1;
21759
22056
  let attempt = 0;
@@ -21868,13 +22165,21 @@ async function runEvalCase(options) {
21868
22165
  ${providerFileChanges}` : providerFileChanges;
21869
22166
  }
21870
22167
  const providerError = extractProviderError(providerResponse);
21871
- if (caseHooksEnabled && repoManager && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none" && evalCase.workspace.repos) {
22168
+ if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none") {
21872
22169
  try {
21873
- await repoManager.reset(
21874
- evalCase.workspace.repos,
21875
- workspacePath,
21876
- evalCase.workspace.hooks.after_each.reset
21877
- );
22170
+ if (repoManager && evalCase.workspace.repos?.length) {
22171
+ await repoManager.reset(
22172
+ evalCase.workspace.repos,
22173
+ workspacePath,
22174
+ evalCase.workspace.hooks.after_each.reset
22175
+ );
22176
+ } else {
22177
+ await resetWorkspaceRoot(
22178
+ workspacePath,
22179
+ evalCase.workspace.hooks.after_each.reset,
22180
+ baselineCommit
22181
+ );
22182
+ }
21878
22183
  } catch {
21879
22184
  }
21880
22185
  }
@@ -22490,6 +22795,276 @@ function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
22490
22795
  "llm-grader": llmGrader
22491
22796
  };
22492
22797
  }
22798
/**
 * Executes a multi-turn eval case against a provider and grades it.
 *
 * Flow, as implemented below:
 *  1. Seed a running history from `evalCase.input` (non-string content is JSON-stringified).
 *  2. For each entry of `evalCase.turns`: append the user input, invoke the provider with the
 *     history (windowed when `evalCase.window_size` is truthy), append the assistant reply, then
 *     grade the reply through `evaluateCandidate` when the turn declares `assertions` or
 *     `expected_output`.
 *  3. When the case itself declares `assertions`, grade the full transcript once more.
 *  4. Combine all turn scores (plus the conversation score, if any) using `evalCase.aggregation`
 *     (defaults to "mean").
 *
 * Per-turn scoring:
 *  - provider invocation throws  -> score 0, verdict "fail"
 *  - nothing to grade            -> score 1, verdict "pass"
 *  - after a stop was triggered  -> score 0, verdict "skip" (provider is not invoked)
 * A stop is triggered only when `evalCase.on_turn_failure` is "stop" and a turn either errored
 * or scored below the threshold.
 *
 * @param {object} options - Execution context supplied by the caller (eval case, provider,
 *   target, evaluator registries, grader provider, workspace settings, threshold, ...).
 * @returns {Promise<object>} Result record containing the aggregated `score`, flattened
 *   `assertions`, per-turn/conversation `scores`, the transcript as `output`, the original
 *   `input`, `executionStatus`, and `evalRun.durationMs`.
 */
async function runConversationMode(options) {
  const {
    evalCase,
    provider,
    target,
    evaluators,
    typeRegistry,
    graderProvider,
    nowFn,
    signal,
    workspacePath,
    caseWorkspaceFile,
    agentTimeoutMs,
    streamCallbacks,
    verbose,
    threshold,
    targetResolver,
    availableTargets
  } = options;
  const conversationTurns = evalCase.turns;
  const aggregation = evalCase.aggregation ?? "mean";
  const stopOnFailure = (evalCase.on_turn_failure ?? "continue") === "stop";
  const windowSize = evalCase.window_size;
  // Threshold used for verdicts and stop decisions; the raw `threshold` is still what gets
  // forwarded to evaluateCandidate.
  const passMark = threshold ?? DEFAULT_THRESHOLD;
  const asText = (value) => typeof value === "string" ? value : JSON.stringify(value);
  const history = evalCase.input.map((msg) => ({ role: msg.role, content: asText(msg.content) }));
  const promptHistory = () => windowSize ? buildWindowedHistory(history, windowSize) : [...history];
  // Arguments that are identical for turn-level and conversation-level grading.
  const sharedGradingArgs = {
    target,
    provider,
    evaluators,
    typeRegistry,
    nowFn,
    attempt: 0,
    graderProvider,
    agentTimeoutMs,
    verbose,
    threshold,
    targetResolver,
    availableTargets
  };
  const turnScores = [];
  let halted = false;
  const caseStartMs = Date.now();
  for (const [offset, turn] of conversationTurns.entries()) {
    const label = `turn-${offset + 1}`;
    if (halted) {
      turnScores.push({
        name: label,
        type: "rubrics",
        score: 0,
        verdict: "skip",
        assertions: [{ text: "Skipped due to previous turn failure", passed: false }]
      });
      continue;
    }
    const userContent = asText(turn.input);
    history.push({ role: "user", content: userContent });
    // Built outside the try block so only provider failures are converted into a failed turn.
    const chatPromptForProvider = promptHistory();
    let response;
    try {
      response = await provider.invoke({
        question: userContent,
        chatPrompt: chatPromptForProvider,
        evalCaseId: `${evalCase.id}/${label}`,
        signal,
        cwd: workspacePath,
        workspaceFile: caseWorkspaceFile,
        streamCallbacks
      });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      // The user message stays in history even though no assistant reply follows it.
      turnScores.push({
        name: label,
        type: "rubrics",
        score: 0,
        verdict: "fail",
        assertions: [{ text: `Provider error: ${message}`, passed: false }]
      });
      if (stopOnFailure) {
        halted = true;
      }
      continue;
    }
    const assistantContent = extractLastAssistantContent(response.output);
    history.push({ role: "assistant", content: assistantContent });
    const hasGradingSpec = Boolean(turn.assertions?.length) || Boolean(turn.expected_output);
    if (!hasGradingSpec) {
      turnScores.push({
        name: label,
        type: "rubrics",
        score: 1,
        verdict: "pass",
        assertions: []
      });
      continue;
    }
    let expectedOutput = [];
    if (turn.expected_output) {
      expectedOutput = [
        typeof turn.expected_output === "string" ? { content: turn.expected_output } : turn.expected_output
      ];
    }
    const turnEvalCase = {
      ...evalCase,
      id: `${evalCase.id}/${label}`,
      assertions: buildTurnAssertions(turn),
      input: buildTurnGraderInput(history, windowSize),
      expected_output: expectedOutput,
      // Clear conversation fields to prevent recursion
      mode: void 0,
      turns: void 0
    };
    const graded = await evaluateCandidate({
      ...sharedGradingArgs,
      evalCase: turnEvalCase,
      candidate: assistantContent,
      promptInputs: {
        question: buildConversationContext(history, windowSize),
        chatPrompt: promptHistory()
      },
      output: response.output
    });
    const turnScore = graded.score;
    turnScores.push({
      name: label,
      type: "rubrics",
      score: turnScore,
      verdict: scoreToVerdict(turnScore, passMark),
      assertions: graded.assertions ? [...graded.assertions] : [],
      scores: graded.scores
    });
    if (stopOnFailure && turnScore < passMark) {
      halted = true;
    }
  }
  const conversationScores = [];
  if (evalCase.assertions?.length) {
    const fullTranscript = history.map((m) => `${m.role}: ${asText(m.content)}`).join("\n\n");
    const conversationEvalCase = {
      ...evalCase,
      id: `${evalCase.id}/conversation`,
      input: history.map((m) => ({
        role: m.role,
        content: m.content
      })),
      expected_output: [],
      mode: void 0,
      turns: void 0
    };
    const transcriptGrade = await evaluateCandidate({
      ...sharedGradingArgs,
      evalCase: conversationEvalCase,
      candidate: fullTranscript,
      promptInputs: {
        question: fullTranscript,
        chatPrompt: [...history]
      }
    });
    conversationScores.push({
      name: "conversation",
      type: "rubrics",
      score: transcriptGrade.score,
      verdict: scoreToVerdict(transcriptGrade.score, passMark),
      assertions: transcriptGrade.assertions ? [...transcriptGrade.assertions] : [],
      scores: transcriptGrade.scores
    });
  }
  const allResultScores = [...turnScores, ...conversationScores];
  // Every record carries the exact score that feeds the aggregate, so derive the values from it.
  const finalScore = aggregateConversationScores(
    allResultScores.map((record) => record.score),
    aggregation
  );
  const outputMessages = history.map((m) => ({
    role: m.role,
    content: m.content
  }));
  const flatAssertions = allResultScores.flatMap((record) => [...record.assertions]);
  const totalDurationMs = Date.now() - caseStartMs;
  return {
    timestamp: nowFn().toISOString(),
    testId: evalCase.id,
    suite: evalCase.suite,
    category: evalCase.category,
    score: finalScore,
    assertions: flatAssertions,
    target: target.name,
    output: outputMessages,
    scores: allResultScores,
    executionStatus: classifyQualityStatus(finalScore, passMark),
    input: evalCase.input.map((m) => ({
      role: m.role,
      content: asText(m.content)
    })),
    evalRun: { durationMs: totalDurationMs }
  };
}
23011
/**
 * Trims a conversation history to its most recent messages while keeping every system message.
 *
 * System messages are always retained and placed first (in their original relative order).
 * Of the remaining messages, only the last `windowSize * 2` are kept.
 *
 * A window that is not a positive number (zero, negative, NaN, undefined) disables trimming and
 * returns all messages. Previously a negative size produced a positive `slice` start index, which
 * silently dropped the OLDEST messages from the front instead of windowing.
 *
 * @param {Array<{role: string, content: unknown}>} history - Full message list; not mutated.
 * @param {number} windowSize - Window size; twice this many non-system messages are kept.
 * @returns {Array<{role: string, content: unknown}>} New array: system messages, then the windowed tail.
 */
function buildWindowedHistory(history, windowSize) {
  const systemMessages = [];
  const conversational = [];
  for (const message of history) {
    (message.role === "system" ? systemMessages : conversational).push(message);
  }
  const keepCount = windowSize * 2;
  // `keepCount > 0` is false for 0, negatives and NaN, so those fall back to "keep everything".
  const windowed = keepCount > 0 ? conversational.slice(-keepCount) : conversational;
  return [...systemMessages, ...windowed];
}
23017
/**
 * Renders a conversation history as a plain-text transcript.
 *
 * Each message becomes `"<role>: <content>"`; non-string content is JSON-stringified.
 * Messages are separated by a blank line. When `windowSize` is truthy the history is first
 * reduced with `buildWindowedHistory`.
 *
 * @param {Array<{role: string, content: unknown}>} history - Messages to render.
 * @param {number} [windowSize] - Optional window size; falsy means "use the full history".
 * @returns {string} The transcript, or an empty string for an empty history.
 */
function buildConversationContext(history, windowSize) {
  const visible = windowSize ? buildWindowedHistory(history, windowSize) : history;
  const renderedLines = [];
  for (const message of visible) {
    const body = typeof message.content === "string" ? message.content : JSON.stringify(message.content);
    renderedLines.push(`${message.role}: ${body}`);
  }
  return renderedLines.join("\n\n");
}
23024
/**
 * Produces the message list handed to the grader as a turn's `input`.
 *
 * Returns fresh `{ role, content }` objects (any other properties on the history entries are
 * left out). When `windowSize` is truthy the history is first reduced with
 * `buildWindowedHistory`.
 *
 * @param {Array<{role: string, content: unknown}>} history - Conversation so far.
 * @param {number} [windowSize] - Optional window size; falsy means "use the full history".
 * @returns {Array<{role: string, content: unknown}>} New array of new message objects.
 */
function buildTurnGraderInput(history, windowSize) {
  const visible = windowSize ? buildWindowedHistory(history, windowSize) : history;
  return Array.from(visible, ({ role, content }) => ({ role, content }));
}
23031
/**
 * Converts a turn's `assertions` list into evaluator configurations.
 *
 * String entries are treated as rubric criteria and folded into a single "llm-grader"
 * configuration named "turn-rubrics" (one rubric per string, ids `criterion-1..n`, weight 1).
 * Non-string entries are passed through unchanged, after the rubric grader.
 *
 * @param {{assertions?: Array<string|object>}} turn - Turn definition.
 * @returns {Array<object>} Evaluator configs; empty when the turn has no assertions.
 */
function buildTurnAssertions(turn) {
  if (!turn.assertions?.length) {
    return [];
  }
  const entries = [...turn.assertions];
  const criteria = entries.filter((entry) => typeof entry === "string");
  const passthrough = entries.filter((entry) => typeof entry !== "string");
  if (criteria.length === 0) {
    return passthrough;
  }
  const rubricGrader = {
    name: "turn-rubrics",
    type: "llm-grader",
    rubrics: criteria.map((text, position) => ({
      id: `criterion-${position + 1}`,
      outcome: text,
      weight: 1
    }))
  };
  return [rubricGrader, ...passthrough];
}
23057
/**
 * Collapses a list of scores into one number.
 *
 * @param {number[]} scores - Scores to combine.
 * @param {string} aggregation - "min" or "max"; any other value yields the arithmetic mean.
 * @returns {number} The aggregated score; `1` when `scores` is empty.
 */
function aggregateConversationScores(scores, aggregation) {
  if (scores.length === 0) {
    return 1;
  }
  if (aggregation === "min") {
    return Math.min(...scores);
  }
  if (aggregation === "max") {
    return Math.max(...scores);
  }
  let total = 0;
  for (const value of scores) {
    total += value;
  }
  return total / scores.length;
}
22493
23068
  async function invokeProvider(provider, options) {
22494
23069
  const {
22495
23070
  evalCase,
@@ -23222,13 +23797,13 @@ function shouldSkipCacheForTemperature(targetConfig) {
23222
23797
 
23223
23798
  // src/evaluation/results-repo.ts
23224
23799
  init_cjs_shims();
23225
- var import_node_child_process11 = require("child_process");
23800
+ var import_node_child_process12 = require("child_process");
23226
23801
  var import_node_fs18 = require("fs");
23227
23802
  var import_promises39 = require("fs/promises");
23228
23803
  var import_node_os9 = __toESM(require("os"), 1);
23229
23804
  var import_node_path53 = __toESM(require("path"), 1);
23230
- var import_node_util7 = require("util");
23231
- var execFileAsync3 = (0, import_node_util7.promisify)(import_node_child_process11.execFile);
23805
+ var import_node_util8 = require("util");
23806
+ var execFileAsync4 = (0, import_node_util8.promisify)(import_node_child_process12.execFile);
23232
23807
  function sanitizeRepoSlug(repo) {
23233
23808
  return repo.trim().replace(/[^A-Za-z0-9._-]+/g, "-");
23234
23809
  }
@@ -23279,7 +23854,7 @@ function writePersistedStatus(statusFile, status) {
23279
23854
  }
23280
23855
  async function runCommand(executable, args, options) {
23281
23856
  try {
23282
- const { stdout, stderr } = await execFileAsync3(executable, [...args], {
23857
+ const { stdout, stderr } = await execFileAsync4(executable, [...args], {
23283
23858
  cwd: options?.cwd,
23284
23859
  env: process.env
23285
23860
  });
@@ -24341,11 +24916,13 @@ function extractAssistantContent(content) {
24341
24916
  break;
24342
24917
  case "tool_use":
24343
24918
  if (block.name) {
24344
- toolCalls.push({
24345
- tool: block.name,
24346
- input: block.input,
24347
- id: block.id
24348
- });
24919
+ toolCalls.push(
24920
+ normalizeToolCall("claude", {
24921
+ tool: block.name,
24922
+ input: block.input,
24923
+ id: block.id
24924
+ })
24925
+ );
24349
24926
  }
24350
24927
  break;
24351
24928
  }
@@ -24438,7 +25015,11 @@ function parseCodexSession(jsonl) {
24438
25015
  } else {
24439
25016
  input = payload.arguments;
24440
25017
  }
24441
- const toolCall = { tool: toolName, input, id: callId };
25018
+ const toolCall = normalizeToolCall("codex", {
25019
+ tool: toolName,
25020
+ input,
25021
+ id: callId
25022
+ });
24442
25023
  const msgIdx = messages.length;
24443
25024
  messages.push({
24444
25025
  role: "assistant",
@@ -24462,7 +25043,11 @@ function parseCodexSession(jsonl) {
24462
25043
  } else {
24463
25044
  input = payload.arguments;
24464
25045
  }
24465
- const toolCall = { tool: toolName, input, id: callId };
25046
+ const toolCall = normalizeToolCall("codex", {
25047
+ tool: toolName,
25048
+ input,
25049
+ id: callId
25050
+ });
24466
25051
  const msgIdx = messages.length;
24467
25052
  messages.push({
24468
25053
  role: "assistant",