@agentv/core 4.14.0 → 4.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-A3HYVKTI.js → chunk-AOOU6PLC.js} +70 -2
- package/dist/chunk-AOOU6PLC.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +89 -11
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +90 -12
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +774 -189
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +68 -14
- package/dist/index.d.ts +68 -14
- package/dist/index.js +705 -189
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-A3HYVKTI.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -5856,10 +5856,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
5856
5856
|
const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
|
|
5857
5857
|
const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
|
|
5858
5858
|
const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
|
|
5859
|
-
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
|
|
5859
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0 || Array.isArray(testCaseConfig.turns) && testCaseConfig.turns.length > 0;
|
|
5860
5860
|
if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
|
|
5861
5861
|
logError3(
|
|
5862
|
-
`Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
|
|
5862
|
+
`Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions/turns`
|
|
5863
5863
|
);
|
|
5864
5864
|
continue;
|
|
5865
5865
|
}
|
|
@@ -5936,6 +5936,14 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
5936
5936
|
) : void 0;
|
|
5937
5937
|
const onDependencyFailureRaw = asString5(testCaseConfig.on_dependency_failure);
|
|
5938
5938
|
const onDependencyFailure = onDependencyFailureRaw === "skip" || onDependencyFailureRaw === "fail" || onDependencyFailureRaw === "run" ? onDependencyFailureRaw : void 0;
|
|
5939
|
+
const modeRaw = asString5(testCaseConfig.mode);
|
|
5940
|
+
const mode = modeRaw === "conversation" ? "conversation" : void 0;
|
|
5941
|
+
const turns = Array.isArray(testCaseConfig.turns) ? parseTurns(testCaseConfig.turns) : void 0;
|
|
5942
|
+
const aggregationRaw = asString5(testCaseConfig.aggregation);
|
|
5943
|
+
const aggregation = aggregationRaw === "mean" || aggregationRaw === "min" || aggregationRaw === "max" ? aggregationRaw : void 0;
|
|
5944
|
+
const onTurnFailureRaw = asString5(testCaseConfig.on_turn_failure);
|
|
5945
|
+
const onTurnFailure = onTurnFailureRaw === "continue" || onTurnFailureRaw === "stop" ? onTurnFailureRaw : void 0;
|
|
5946
|
+
const windowSize = typeof testCaseConfig.window_size === "number" && testCaseConfig.window_size >= 1 ? testCaseConfig.window_size : void 0;
|
|
5939
5947
|
const testCase = {
|
|
5940
5948
|
id,
|
|
5941
5949
|
suite: suiteName,
|
|
@@ -5954,6 +5962,11 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
5954
5962
|
metadata,
|
|
5955
5963
|
targets: caseTargets,
|
|
5956
5964
|
...caseThreshold !== void 0 ? { threshold: caseThreshold } : {},
|
|
5965
|
+
...mode ? { mode } : {},
|
|
5966
|
+
...turns && turns.length > 0 ? { turns } : {},
|
|
5967
|
+
...aggregation ? { aggregation } : {},
|
|
5968
|
+
...onTurnFailure ? { on_turn_failure: onTurnFailure } : {},
|
|
5969
|
+
...windowSize !== void 0 ? { window_size: windowSize } : {},
|
|
5957
5970
|
...dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {},
|
|
5958
5971
|
...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}
|
|
5959
5972
|
};
|
|
@@ -5971,6 +5984,25 @@ async function loadTestById(evalFilePath, repoRoot, evalId) {
|
|
|
5971
5984
|
return match;
|
|
5972
5985
|
}
|
|
5973
5986
|
var loadEvalCaseById = loadTestById;
|
|
5987
|
+
/**
 * Converts the raw `turns` array of a test case into turn objects.
 *
 * Each resulting turn always carries `input`; `expected_output` is included
 * only when the raw turn defines it, and `assertions` only when the raw turn
 * holds a non-empty array. Any other key on the raw turn is dropped.
 *
 * @param {Array<object>} rawTurns - Raw turn entries as read from the eval file.
 * @returns {Array<object>} One parsed turn per raw entry, in the same order.
 */
function parseTurns(rawTurns) {
  return rawTurns.map((rawTurn) => {
    const {
      input,
      expected_output: expectedOutput,
      assertions: rawAssertions
    } = rawTurn;
    const parsedTurn = { input };
    if (expectedOutput !== void 0) {
      parsedTurn.expected_output = expectedOutput;
    }
    if (Array.isArray(rawAssertions) && rawAssertions.length > 0) {
      // Shallow copy: string and object assertions are both passed through as-is.
      parsedTurn.assertions = rawAssertions.slice();
    }
    return parsedTurn;
  });
}
|
|
5974
6006
|
function parseCommandArray(source) {
|
|
5975
6007
|
if (typeof source === "string") {
|
|
5976
6008
|
const parts = source.trim().split(/\s+/);
|
|
@@ -7053,6 +7085,155 @@ function subscribeToClaudeLogEntries(listener) {
|
|
|
7053
7085
|
};
|
|
7054
7086
|
}
|
|
7055
7087
|
|
|
7088
|
+
// src/evaluation/providers/normalize-tool-call.ts
|
|
7089
|
+
init_cjs_shims();
|
|
7090
|
+
/**
 * Exact-match lookup from `"<providerKind>::<nativeToolName>"` to the
 * canonical tool name ("Skill", "Read", "Write", "Edit" or "Bash").
 *
 * The entries are generated from per-family tables: every provider kind in a
 * family shares the same list of native tool aliases.
 */
var TOOL_NAME_MAP = /* @__PURE__ */ new Map(
  [
    // --- Claude (already canonical) ---
    {
      providers: ["claude", "claude-cli", "claude-sdk"],
      tools: [
        ["Skill", "Skill"],
        ["Read", "Read"],
        ["Write", "Write"],
        ["Edit", "Edit"],
        ["Bash", "Bash"]
      ]
    },
    // --- Copilot ---
    {
      providers: ["copilot-cli", "copilot-sdk", "copilot-log", "vscode", "vscode-insiders"],
      tools: [
        ["Skill", "Skill"],
        ["skill", "Skill"],
        ["Read File", "Read"],
        ["readFile", "Read"],
        ["Read", "Read"],
        ["readTextFile", "Read"],
        ["writeTextFile", "Write"],
        ["Write File", "Write"],
        ["editFile", "Edit"],
        ["Edit File", "Edit"],
        ["runTerminalCommand", "Bash"]
      ]
    },
    // --- Codex ---
    {
      providers: ["codex"],
      tools: [
        ["command_execution", "Bash"],
        ["file_change", "Edit"]
      ]
    },
    // --- Pi ---
    {
      providers: ["pi-coding-agent", "pi-cli"],
      tools: [
        ["read", "Read"],
        ["bash", "Bash"]
      ]
    }
  ].flatMap(
    ({ providers, tools }) => providers.flatMap(
      (provider) => tools.map(([nativeName, canonicalName]) => [`${provider}::${nativeName}`, canonicalName])
    )
  )
);
|
|
7172
|
+
// Prefix rules shared by every Copilot-family provider kind. A rule with
// `extractSkillFromName` marks tool names whose text after the prefix is
// copied into `input.skill` by normalizeToolCall.
var COPILOT_PREFIXES = [
  { prefix: "Using skill: ", canonical: "Skill", extractSkillFromName: true },
  { prefix: "Viewing ", canonical: "Read" }
];
// Prefix rules for Codex: tool names starting with "mcp:" map to "Skill".
var CODEX_PREFIXES = [{ prefix: "mcp:", canonical: "Skill", extractSkillFromName: true }];
// Provider kind -> ordered prefix rules, consulted when no exact tool-name
// match exists. All Copilot-family kinds share one rule array instance.
var TOOL_PREFIX_MAP = /* @__PURE__ */ new Map([
  ...["copilot-cli", "copilot-sdk", "copilot-log", "vscode", "vscode-insiders"].map(
    (providerKind) => [providerKind, COPILOT_PREFIXES]
  ),
  ["codex", CODEX_PREFIXES]
]);
|
|
7187
|
+
/**
 * Input normalizer for canonical "Skill" tool calls.
 *
 * Skill inputs need no field renaming, so the input is returned untouched.
 * The previous implementation tested `input.skill` and then returned `input`
 * on both branches; that dead conditional is removed.
 *
 * @param {*} input - Tool-call input.
 * @returns {*} The very same `input` value.
 */
var normalizeSkillInput = (input) => input;
|
|
7191
|
+
/**
 * Input normalizer for canonical "Read" tool calls: guarantees a `file_path`
 * field whenever the input carries the path under a known alias.
 *
 * Precedence: an existing `file_path` wins, then `path`, then `filePath`.
 * The original object is never mutated; a copy is returned only when
 * `file_path` has to be added.
 *
 * @param {object} input - Tool-call input.
 * @returns {object} `input` itself when nothing changes, otherwise a shallow
 *   copy with `file_path` set.
 */
var normalizeReadInput = (input) => {
  if (input.file_path !== void 0) {
    return input;
  }
  const aliasedPath = input.path !== void 0 ? input.path : input.filePath;
  return aliasedPath === void 0 ? input : { ...input, file_path: aliasedPath };
};
|
|
7197
|
+
// Canonical tool name -> function that normalizes that tool's input.
// Canonical tools without an entry keep their input unchanged.
var INPUT_NORMALIZERS = /* @__PURE__ */ new Map(
  Object.entries({
    Skill: normalizeSkillInput,
    Read: normalizeReadInput
  })
);
|
|
7201
|
+
/**
 * Rewrites a provider-native tool call to use a canonical tool name.
 *
 * Resolution order:
 *   1. exact lookup of `"<providerKind>::<tc.tool>"` in TOOL_NAME_MAP;
 *   2. the first prefix rule registered for `providerKind` in TOOL_PREFIX_MAP
 *      whose prefix starts the tool name;
 *   3. otherwise the call is returned unchanged.
 *
 * For a prefix rule flagged `extractSkillFromName`, a non-empty remainder of
 * the tool name after the prefix is stored as `input.skill`. The incoming
 * `tc` object is never mutated.
 *
 * @param {string} providerKind - Provider identifier, e.g. "copilot-cli".
 * @param {object} tc - Tool call with at least a `tool` name and optional `input`.
 * @returns {object} The normalized tool call, or `tc` itself when no rule applies.
 */
function normalizeToolCall(providerKind, tc) {
  const nativeName = tc.tool;
  const exactMatch = TOOL_NAME_MAP.get(`${providerKind}::${nativeName}`);
  if (exactMatch) {
    return applyInputNormalization(exactMatch, { ...tc, tool: exactMatch });
  }
  const matchedRule = TOOL_PREFIX_MAP.get(providerKind)?.find(
    (candidate) => nativeName.startsWith(candidate.prefix)
  );
  if (!matchedRule) {
    return tc;
  }
  const remainder = nativeName.slice(matchedRule.prefix.length);
  const input = matchedRule.extractSkillFromName && remainder ? { ...tc.input ?? {}, skill: remainder } : tc.input;
  return applyInputNormalization(matchedRule.canonical, {
    ...tc,
    tool: matchedRule.canonical,
    input
  });
}
|
|
7229
|
+
/**
 * Runs the input normalizer registered for a canonical tool, if any.
 *
 * @param {string} canonical - Canonical tool name used to look up the normalizer.
 * @param {object} tc - Tool call whose `input` may be normalized.
 * @returns {object} `tc` itself when there is no normalizer, no input, or the
 *   normalizer returned the same input object; otherwise a shallow copy of
 *   `tc` carrying the normalized input.
 */
function applyInputNormalization(canonical, tc) {
  const originalInput = tc.input;
  if (originalInput === void 0 || originalInput === null) {
    return tc;
  }
  const normalize = INPUT_NORMALIZERS.get(canonical);
  if (!normalize) {
    return tc;
  }
  const normalizedInput = normalize(originalInput);
  if (normalizedInput === originalInput) {
    return tc;
  }
  return { ...tc, input: normalizedInput };
}
|
|
7236
|
+
|
|
7056
7237
|
// src/evaluation/providers/preread.ts
|
|
7057
7238
|
init_cjs_shims();
|
|
7058
7239
|
var import_node_path12 = __toESM(require("path"), 1);
|
|
@@ -7521,11 +7702,13 @@ function extractToolCalls(content) {
|
|
|
7521
7702
|
}
|
|
7522
7703
|
const p = part;
|
|
7523
7704
|
if (p.type === "tool_use" && typeof p.name === "string") {
|
|
7524
|
-
toolCalls.push(
|
|
7525
|
-
|
|
7526
|
-
|
|
7527
|
-
|
|
7528
|
-
|
|
7705
|
+
toolCalls.push(
|
|
7706
|
+
normalizeToolCall("claude-cli", {
|
|
7707
|
+
tool: p.name,
|
|
7708
|
+
input: p.input,
|
|
7709
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
7710
|
+
})
|
|
7711
|
+
);
|
|
7529
7712
|
}
|
|
7530
7713
|
}
|
|
7531
7714
|
return toolCalls;
|
|
@@ -7817,11 +8000,13 @@ function extractToolCalls2(content) {
|
|
|
7817
8000
|
}
|
|
7818
8001
|
const p = part;
|
|
7819
8002
|
if (p.type === "tool_use" && typeof p.name === "string") {
|
|
7820
|
-
toolCalls.push(
|
|
7821
|
-
|
|
7822
|
-
|
|
7823
|
-
|
|
7824
|
-
|
|
8003
|
+
toolCalls.push(
|
|
8004
|
+
normalizeToolCall("claude-sdk", {
|
|
8005
|
+
tool: p.name,
|
|
8006
|
+
input: p.input,
|
|
8007
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
8008
|
+
})
|
|
8009
|
+
);
|
|
7825
8010
|
}
|
|
7826
8011
|
}
|
|
7827
8012
|
return toolCalls;
|
|
@@ -8739,27 +8924,33 @@ ${basePrompt}` : basePrompt;
|
|
|
8739
8924
|
}
|
|
8740
8925
|
}
|
|
8741
8926
|
if (itemType === "command_execution") {
|
|
8742
|
-
completedToolCalls.push(
|
|
8743
|
-
|
|
8744
|
-
|
|
8745
|
-
|
|
8746
|
-
|
|
8747
|
-
|
|
8927
|
+
completedToolCalls.push(
|
|
8928
|
+
normalizeToolCall("codex", {
|
|
8929
|
+
tool: "command_execution",
|
|
8930
|
+
input: { command: item.command },
|
|
8931
|
+
output: item.aggregated_output,
|
|
8932
|
+
id: item.id
|
|
8933
|
+
})
|
|
8934
|
+
);
|
|
8748
8935
|
}
|
|
8749
8936
|
if (itemType === "file_change") {
|
|
8750
|
-
completedToolCalls.push(
|
|
8751
|
-
|
|
8752
|
-
|
|
8753
|
-
|
|
8754
|
-
|
|
8937
|
+
completedToolCalls.push(
|
|
8938
|
+
normalizeToolCall("codex", {
|
|
8939
|
+
tool: "file_change",
|
|
8940
|
+
input: item.changes,
|
|
8941
|
+
id: item.id
|
|
8942
|
+
})
|
|
8943
|
+
);
|
|
8755
8944
|
}
|
|
8756
8945
|
if (itemType === "mcp_tool_call") {
|
|
8757
|
-
completedToolCalls.push(
|
|
8758
|
-
|
|
8759
|
-
|
|
8760
|
-
|
|
8761
|
-
|
|
8762
|
-
|
|
8946
|
+
completedToolCalls.push(
|
|
8947
|
+
normalizeToolCall("codex", {
|
|
8948
|
+
tool: `mcp:${item.server}/${item.tool}`,
|
|
8949
|
+
input: item.arguments,
|
|
8950
|
+
output: item.result ?? item.error,
|
|
8951
|
+
id: item.id
|
|
8952
|
+
})
|
|
8953
|
+
);
|
|
8763
8954
|
}
|
|
8764
8955
|
}
|
|
8765
8956
|
resolveCwd(cwdOverride) {
|
|
@@ -9299,12 +9490,6 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
|
|
|
9299
9490
|
return logger;
|
|
9300
9491
|
}
|
|
9301
9492
|
handleEvent(eventType, data) {
|
|
9302
|
-
if (this.format === "json") {
|
|
9303
|
-
const elapsed2 = formatElapsed4(this.startedAt);
|
|
9304
|
-
this.stream.write(`${JSON.stringify({ time: elapsed2, event: eventType, data })}
|
|
9305
|
-
`);
|
|
9306
|
-
return;
|
|
9307
|
-
}
|
|
9308
9493
|
if (this.chunkExtractor) {
|
|
9309
9494
|
const chunkText = this.chunkExtractor(eventType, data);
|
|
9310
9495
|
if (chunkText === null) {
|
|
@@ -9317,6 +9502,12 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
|
|
|
9317
9502
|
}
|
|
9318
9503
|
this.flushPendingText();
|
|
9319
9504
|
}
|
|
9505
|
+
if (this.format === "json") {
|
|
9506
|
+
const elapsed2 = formatElapsed4(this.startedAt);
|
|
9507
|
+
this.stream.write(`${JSON.stringify({ time: elapsed2, event: eventType, data })}
|
|
9508
|
+
`);
|
|
9509
|
+
return;
|
|
9510
|
+
}
|
|
9320
9511
|
const elapsed = formatElapsed4(this.startedAt);
|
|
9321
9512
|
const summary = this.summarize(eventType, data);
|
|
9322
9513
|
if (summary) {
|
|
@@ -9327,14 +9518,19 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
|
|
|
9327
9518
|
flushPendingText() {
|
|
9328
9519
|
if (!this.pendingText) return;
|
|
9329
9520
|
const elapsed = formatElapsed4(this.startedAt);
|
|
9330
|
-
this.
|
|
9521
|
+
if (this.format === "json") {
|
|
9522
|
+
this.stream.write(
|
|
9523
|
+
`${JSON.stringify({ time: elapsed, event: "assistant_message", data: { content: this.pendingText } })}
|
|
9524
|
+
`
|
|
9525
|
+
);
|
|
9526
|
+
} else {
|
|
9527
|
+
this.stream.write(`[+${elapsed}] [assistant_message] ${this.pendingText}
|
|
9331
9528
|
`);
|
|
9529
|
+
}
|
|
9332
9530
|
this.pendingText = "";
|
|
9333
9531
|
}
|
|
9334
9532
|
async close() {
|
|
9335
|
-
|
|
9336
|
-
this.flushPendingText();
|
|
9337
|
-
}
|
|
9533
|
+
this.flushPendingText();
|
|
9338
9534
|
await new Promise((resolve, reject) => {
|
|
9339
9535
|
this.stream.once("error", reject);
|
|
9340
9536
|
this.stream.end(() => resolve());
|
|
@@ -9409,15 +9605,17 @@ var CopilotCliProvider = class {
|
|
|
9409
9605
|
}
|
|
9410
9606
|
if (update.status === "completed" || update.status === "failed") {
|
|
9411
9607
|
const toolName = update.title ?? update.kind ?? "unknown";
|
|
9412
|
-
completedToolCalls.push(
|
|
9413
|
-
|
|
9414
|
-
|
|
9415
|
-
|
|
9416
|
-
|
|
9417
|
-
|
|
9418
|
-
|
|
9419
|
-
|
|
9420
|
-
|
|
9608
|
+
completedToolCalls.push(
|
|
9609
|
+
normalizeToolCall("copilot-cli", {
|
|
9610
|
+
tool: toolName,
|
|
9611
|
+
input: update.rawInput,
|
|
9612
|
+
output: update.rawOutput,
|
|
9613
|
+
id: callId,
|
|
9614
|
+
startTime: (/* @__PURE__ */ new Date()).toISOString(),
|
|
9615
|
+
endTime: (/* @__PURE__ */ new Date()).toISOString(),
|
|
9616
|
+
durationMs: 0
|
|
9617
|
+
})
|
|
9618
|
+
);
|
|
9421
9619
|
request.streamCallbacks?.onToolCallEnd?.(
|
|
9422
9620
|
toolName,
|
|
9423
9621
|
update.rawInput,
|
|
@@ -9434,15 +9632,17 @@ var CopilotCliProvider = class {
|
|
|
9434
9632
|
if (inProgress) {
|
|
9435
9633
|
toolCallsInProgress.delete(callId);
|
|
9436
9634
|
const duration = Date.now() - inProgress.startMs;
|
|
9437
|
-
completedToolCalls.push(
|
|
9438
|
-
|
|
9439
|
-
|
|
9440
|
-
|
|
9441
|
-
|
|
9442
|
-
|
|
9443
|
-
|
|
9444
|
-
|
|
9445
|
-
|
|
9635
|
+
completedToolCalls.push(
|
|
9636
|
+
normalizeToolCall("copilot-cli", {
|
|
9637
|
+
tool: inProgress.tool,
|
|
9638
|
+
input: inProgress.input,
|
|
9639
|
+
output: update.rawOutput,
|
|
9640
|
+
id: inProgress.id,
|
|
9641
|
+
startTime: inProgress.startTime,
|
|
9642
|
+
endTime: (/* @__PURE__ */ new Date()).toISOString(),
|
|
9643
|
+
durationMs: duration
|
|
9644
|
+
})
|
|
9645
|
+
);
|
|
9446
9646
|
request.streamCallbacks?.onToolCallEnd?.(
|
|
9447
9647
|
inProgress.tool,
|
|
9448
9648
|
inProgress.input,
|
|
@@ -9788,11 +9988,13 @@ function parseCopilotEvents(eventsJsonl) {
|
|
|
9788
9988
|
}
|
|
9789
9989
|
case "assistant.message": {
|
|
9790
9990
|
const toolRequests = data.toolRequests;
|
|
9791
|
-
const toolCalls = (toolRequests ?? []).map(
|
|
9792
|
-
|
|
9793
|
-
|
|
9794
|
-
|
|
9795
|
-
|
|
9991
|
+
const toolCalls = (toolRequests ?? []).map(
|
|
9992
|
+
(req) => normalizeToolCall("copilot-log", {
|
|
9993
|
+
tool: String(req.name ?? req.toolName ?? ""),
|
|
9994
|
+
input: req.arguments,
|
|
9995
|
+
id: req.toolCallId ? String(req.toolCallId) : void 0
|
|
9996
|
+
})
|
|
9997
|
+
);
|
|
9796
9998
|
messages.push({
|
|
9797
9999
|
role: "assistant",
|
|
9798
10000
|
content: data.content != null ? String(data.content) : void 0,
|
|
@@ -9832,12 +10034,12 @@ function parseCopilotEvents(eventsJsonl) {
|
|
|
9832
10034
|
messages.push({
|
|
9833
10035
|
role: "assistant",
|
|
9834
10036
|
toolCalls: [
|
|
9835
|
-
{
|
|
10037
|
+
normalizeToolCall("copilot-log", {
|
|
9836
10038
|
tool: started.toolName,
|
|
9837
10039
|
input: started.input,
|
|
9838
10040
|
output: data.result,
|
|
9839
10041
|
id: toolCallId
|
|
9840
|
-
}
|
|
10042
|
+
})
|
|
9841
10043
|
]
|
|
9842
10044
|
});
|
|
9843
10045
|
}
|
|
@@ -10186,15 +10388,17 @@ var CopilotSdkProvider = class {
|
|
|
10186
10388
|
if (inProgress) {
|
|
10187
10389
|
toolCallsInProgress.delete(callId);
|
|
10188
10390
|
const endMs = Date.now();
|
|
10189
|
-
completedToolCalls.push(
|
|
10190
|
-
|
|
10191
|
-
|
|
10192
|
-
|
|
10193
|
-
|
|
10194
|
-
|
|
10195
|
-
|
|
10196
|
-
|
|
10197
|
-
|
|
10391
|
+
completedToolCalls.push(
|
|
10392
|
+
normalizeToolCall("copilot-sdk", {
|
|
10393
|
+
tool: inProgress.tool,
|
|
10394
|
+
input: inProgress.input,
|
|
10395
|
+
output: data?.output ?? data?.result,
|
|
10396
|
+
id: inProgress.id,
|
|
10397
|
+
startTime: inProgress.startTime,
|
|
10398
|
+
endTime: (/* @__PURE__ */ new Date()).toISOString(),
|
|
10399
|
+
durationMs: endMs - inProgress.startMs
|
|
10400
|
+
})
|
|
10401
|
+
);
|
|
10198
10402
|
}
|
|
10199
10403
|
}
|
|
10200
10404
|
if (eventType === "assistant.message") {
|
|
@@ -11178,12 +11382,14 @@ function extractToolCallsFromEvents(events) {
|
|
|
11178
11382
|
}
|
|
11179
11383
|
const toolCalls = [];
|
|
11180
11384
|
for (const [id, { tool: tool2, input }] of starts) {
|
|
11181
|
-
toolCalls.push(
|
|
11182
|
-
|
|
11183
|
-
|
|
11184
|
-
|
|
11185
|
-
|
|
11186
|
-
|
|
11385
|
+
toolCalls.push(
|
|
11386
|
+
normalizeToolCall("pi-cli", {
|
|
11387
|
+
tool: tool2,
|
|
11388
|
+
input,
|
|
11389
|
+
id: id.startsWith("anon-") ? void 0 : id,
|
|
11390
|
+
output: results.get(id)
|
|
11391
|
+
})
|
|
11392
|
+
);
|
|
11187
11393
|
}
|
|
11188
11394
|
return toolCalls;
|
|
11189
11395
|
}
|
|
@@ -11305,17 +11511,21 @@ function extractToolCalls3(content) {
|
|
|
11305
11511
|
if (!part || typeof part !== "object") continue;
|
|
11306
11512
|
const p = part;
|
|
11307
11513
|
if (p.type === "tool_use" && typeof p.name === "string") {
|
|
11308
|
-
toolCalls.push(
|
|
11309
|
-
|
|
11310
|
-
|
|
11311
|
-
|
|
11312
|
-
|
|
11514
|
+
toolCalls.push(
|
|
11515
|
+
normalizeToolCall("pi-cli", {
|
|
11516
|
+
tool: p.name,
|
|
11517
|
+
input: p.input,
|
|
11518
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
11519
|
+
})
|
|
11520
|
+
);
|
|
11313
11521
|
} else if ((p.type === "toolCall" || p.type === "tool_call") && typeof p.name === "string") {
|
|
11314
|
-
toolCalls.push(
|
|
11315
|
-
|
|
11316
|
-
|
|
11317
|
-
|
|
11318
|
-
|
|
11522
|
+
toolCalls.push(
|
|
11523
|
+
normalizeToolCall("pi-cli", {
|
|
11524
|
+
tool: p.name,
|
|
11525
|
+
input: p.arguments ?? p.input,
|
|
11526
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
11527
|
+
})
|
|
11528
|
+
);
|
|
11319
11529
|
} else if (p.type === "tool_result" && typeof p.tool_use_id === "string") {
|
|
11320
11530
|
const existing = toolCalls.find((tc) => tc.id === p.tool_use_id);
|
|
11321
11531
|
if (existing) {
|
|
@@ -12776,6 +12986,11 @@ function resolveCodexConfig(target, env, evalFilePath) {
|
|
|
12776
12986
|
const logDirSource = target.log_dir ?? target.log_directory;
|
|
12777
12987
|
const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CODEX_LOG_FORMAT;
|
|
12778
12988
|
const systemPromptSource = target.system_prompt;
|
|
12989
|
+
const streamLogResult = resolveStreamLog(target, env.AGENTV_CODEX_LOG_FORMAT);
|
|
12990
|
+
if (streamLogResult.deprecationWarning) {
|
|
12991
|
+
process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
|
|
12992
|
+
`);
|
|
12993
|
+
}
|
|
12779
12994
|
const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, {
|
|
12780
12995
|
allowLiteral: true,
|
|
12781
12996
|
optionalEnv: true
|
|
@@ -12822,6 +13037,7 @@ function resolveCodexConfig(target, env, evalFilePath) {
|
|
|
12822
13037
|
timeoutMs,
|
|
12823
13038
|
logDir,
|
|
12824
13039
|
logFormat,
|
|
13040
|
+
streamLog: streamLogResult.streamLog,
|
|
12825
13041
|
systemPrompt
|
|
12826
13042
|
};
|
|
12827
13043
|
}
|
|
@@ -12838,6 +13054,38 @@ function normalizeCodexLogFormat(value) {
|
|
|
12838
13054
|
}
|
|
12839
13055
|
throw new Error("codex log format must be 'summary' or 'json'");
|
|
12840
13056
|
}
|
|
13057
|
+
/**
 * Resolves the stream-log mode of a target, honoring the deprecated
 * `log_format` setting as a fallback.
 *
 * An explicit `stream_log` takes precedence: `false` / "false" disables
 * stream logging, "raw" corresponds to the "json" log format and "summary"
 * to the "summary" log format. Without `stream_log`, the value of
 * `log_format`, then `log_output_format`, then `envFallback` is translated
 * ("json" -> "raw", "summary" -> "summary") and a deprecation warning is
 * attached to the result.
 *
 * @param {object} target - Target definition (`name`, `stream_log`, `log_format`, `log_output_format`).
 * @param {string} [envFallback] - Log format taken from the environment, used last.
 * @returns {{streamLog: (false|"raw"|"summary"|undefined), logFormat: ("json"|"summary"|undefined), deprecationWarning?: string}}
 * @throws {Error} When `stream_log` or the legacy log format holds an unsupported value.
 */
function resolveStreamLog(target, envFallback) {
  const explicit = target.stream_log;
  if (explicit !== void 0 && explicit !== null) {
    switch (explicit) {
      case false:
      case "false":
        return { streamLog: false, logFormat: void 0 };
      case "raw":
        return { streamLog: "raw", logFormat: "json" };
      case "summary":
        return { streamLog: "summary", logFormat: "summary" };
      default:
        throw new Error(`${target.name}: stream_log must be false, 'raw', or 'summary'`);
    }
  }
  const legacyFormat = target.log_format ?? target.log_output_format ?? envFallback;
  if (legacyFormat === void 0 || legacyFormat === null) {
    return { streamLog: void 0, logFormat: void 0 };
  }
  // Non-string values and unknown strings are rejected with the same message.
  const normalized = typeof legacyFormat === "string" ? legacyFormat.trim().toLowerCase() : void 0;
  if (normalized !== "json" && normalized !== "summary") {
    throw new Error(`${target.name}: log_format must be 'summary' or 'json'`);
  }
  const streamLogEquivalent = normalized === "json" ? "raw" : "summary";
  const deprecationWarning = `${target.name}: 'log_format' is deprecated and will be removed in v4.16. Use 'stream_log: ${streamLogEquivalent}' instead (log_format: '${normalized}' \u2192 stream_log: '${streamLogEquivalent}').`;
  return {
    streamLog: streamLogEquivalent,
    logFormat: normalized,
    deprecationWarning
  };
}
|
|
12841
13089
|
function resolveCopilotSdkConfig(target, env, evalFilePath) {
|
|
12842
13090
|
const cliUrlSource = target.cli_url;
|
|
12843
13091
|
const cliPathSource = target.cli_path;
|
|
@@ -12849,6 +13097,11 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
|
|
|
12849
13097
|
const logDirSource = target.log_dir ?? target.log_directory;
|
|
12850
13098
|
const logFormatSource = target.log_format;
|
|
12851
13099
|
const systemPromptSource = target.system_prompt;
|
|
13100
|
+
const streamLogResult = resolveStreamLog(target);
|
|
13101
|
+
if (streamLogResult.deprecationWarning) {
|
|
13102
|
+
process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
|
|
13103
|
+
`);
|
|
13104
|
+
}
|
|
12852
13105
|
const cliUrl = resolveOptionalString(cliUrlSource, env, `${target.name} copilot-sdk cli URL`, {
|
|
12853
13106
|
allowLiteral: true,
|
|
12854
13107
|
optionalEnv: true
|
|
@@ -12959,6 +13212,7 @@ function resolveCopilotSdkConfig(target, env, evalFilePath) {
|
|
|
12959
13212
|
timeoutMs,
|
|
12960
13213
|
logDir,
|
|
12961
13214
|
logFormat,
|
|
13215
|
+
streamLog: streamLogResult.streamLog,
|
|
12962
13216
|
systemPrompt,
|
|
12963
13217
|
byokType,
|
|
12964
13218
|
byokBaseUrl,
|
|
@@ -12978,6 +13232,11 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
|
|
|
12978
13232
|
const logDirSource = target.log_dir ?? target.log_directory;
|
|
12979
13233
|
const logFormatSource = target.log_format;
|
|
12980
13234
|
const systemPromptSource = target.system_prompt;
|
|
13235
|
+
const streamLogResult = resolveStreamLog(target);
|
|
13236
|
+
if (streamLogResult.deprecationWarning) {
|
|
13237
|
+
process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
|
|
13238
|
+
`);
|
|
13239
|
+
}
|
|
12981
13240
|
const executable = resolveOptionalString(executableSource, env, `${target.name} copilot-cli executable`, {
|
|
12982
13241
|
allowLiteral: true,
|
|
12983
13242
|
optionalEnv: true
|
|
@@ -13029,6 +13288,7 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
|
|
|
13029
13288
|
timeoutMs,
|
|
13030
13289
|
logDir,
|
|
13031
13290
|
logFormat,
|
|
13291
|
+
streamLog: streamLogResult.streamLog,
|
|
13032
13292
|
systemPrompt
|
|
13033
13293
|
};
|
|
13034
13294
|
}
|
|
@@ -13051,6 +13311,11 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
13051
13311
|
const logDirSource = target.log_dir ?? target.log_directory;
|
|
13052
13312
|
const logFormatSource = target.log_format;
|
|
13053
13313
|
const systemPromptSource = target.system_prompt;
|
|
13314
|
+
const streamLogResult = resolveStreamLog(target);
|
|
13315
|
+
if (streamLogResult.deprecationWarning) {
|
|
13316
|
+
process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
|
|
13317
|
+
`);
|
|
13318
|
+
}
|
|
13054
13319
|
const subprovider = resolveOptionalString(
|
|
13055
13320
|
subproviderSource,
|
|
13056
13321
|
env,
|
|
@@ -13121,6 +13386,7 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
|
|
|
13121
13386
|
timeoutMs,
|
|
13122
13387
|
logDir,
|
|
13123
13388
|
logFormat,
|
|
13389
|
+
streamLog: streamLogResult.streamLog,
|
|
13124
13390
|
systemPrompt
|
|
13125
13391
|
};
|
|
13126
13392
|
}
|
|
@@ -13137,6 +13403,11 @@ function resolvePiCliConfig(target, env, evalFilePath) {
|
|
|
13137
13403
|
const logDirSource = target.log_dir ?? target.log_directory;
|
|
13138
13404
|
const logFormatSource = target.log_format;
|
|
13139
13405
|
const systemPromptSource = target.system_prompt;
|
|
13406
|
+
const streamLogResult = resolveStreamLog(target);
|
|
13407
|
+
if (streamLogResult.deprecationWarning) {
|
|
13408
|
+
process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
|
|
13409
|
+
`);
|
|
13410
|
+
}
|
|
13140
13411
|
const executable = resolveOptionalString(executableSource, env, `${target.name} pi-cli executable`, {
|
|
13141
13412
|
allowLiteral: true,
|
|
13142
13413
|
optionalEnv: true
|
|
@@ -13207,6 +13478,7 @@ function resolvePiCliConfig(target, env, evalFilePath) {
|
|
|
13207
13478
|
timeoutMs,
|
|
13208
13479
|
logDir,
|
|
13209
13480
|
logFormat,
|
|
13481
|
+
streamLog: streamLogResult.streamLog,
|
|
13210
13482
|
systemPrompt
|
|
13211
13483
|
};
|
|
13212
13484
|
}
|
|
@@ -13218,6 +13490,11 @@ function resolveClaudeConfig(target, env, evalFilePath) {
|
|
|
13218
13490
|
const logDirSource = target.log_dir ?? target.log_directory;
|
|
13219
13491
|
const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CLAUDE_LOG_FORMAT;
|
|
13220
13492
|
const systemPromptSource = target.system_prompt;
|
|
13493
|
+
const streamLogResult = resolveStreamLog(target);
|
|
13494
|
+
if (streamLogResult.deprecationWarning) {
|
|
13495
|
+
process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
|
|
13496
|
+
`);
|
|
13497
|
+
}
|
|
13221
13498
|
const model = resolveOptionalString(modelSource, env, `${target.name} claude model`, {
|
|
13222
13499
|
allowLiteral: true,
|
|
13223
13500
|
optionalEnv: true
|
|
@@ -13261,7 +13538,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
|
|
|
13261
13538
|
maxTurns,
|
|
13262
13539
|
maxBudgetUsd,
|
|
13263
13540
|
logDir,
|
|
13264
|
-
logFormat
|
|
13541
|
+
logFormat,
|
|
13542
|
+
streamLog: streamLogResult.streamLog
|
|
13265
13543
|
};
|
|
13266
13544
|
}
|
|
13267
13545
|
function normalizeClaudeLogFormat(value) {
|
|
@@ -17946,100 +18224,35 @@ var LatencyEvaluator = class {
|
|
|
17946
18224
|
|
|
17947
18225
|
// src/evaluation/evaluators/skill-trigger.ts
|
|
17948
18226
|
init_cjs_shims();
|
|
17949
|
-
var CLAUDE_MATCHER = {
|
|
17950
|
-
skillTools: ["Skill"],
|
|
17951
|
-
skillInputField: "skill",
|
|
17952
|
-
readTools: ["Read"],
|
|
17953
|
-
readInputField: "file_path"
|
|
17954
|
-
};
|
|
17955
|
-
var COPILOT_MATCHER = {
|
|
17956
|
-
skillTools: ["Skill", "skill"],
|
|
17957
|
-
skillInputField: "skill",
|
|
17958
|
-
readTools: ["Read File", "readFile", "Read", "readTextFile"],
|
|
17959
|
-
readInputField: "file_path",
|
|
17960
|
-
skillToolPrefixes: ["Using skill: "],
|
|
17961
|
-
readToolPrefixes: ["Viewing "],
|
|
17962
|
-
readInputFields: ["file_path", "path"]
|
|
17963
|
-
};
|
|
17964
|
-
var PI_CODING_AGENT_MATCHER = {
|
|
17965
|
-
skillTools: [],
|
|
17966
|
-
skillInputField: "skill",
|
|
17967
|
-
readTools: ["read"],
|
|
17968
|
-
readInputField: "path",
|
|
17969
|
-
readInputFields: ["path", "file_path", "filePath"]
|
|
17970
|
-
};
|
|
17971
|
-
var CODEX_MATCHER = {
|
|
17972
|
-
skillTools: [],
|
|
17973
|
-
skillInputField: "skill",
|
|
17974
|
-
readTools: ["command_execution"],
|
|
17975
|
-
readInputField: "command",
|
|
17976
|
-
skillToolPrefixes: ["mcp:"],
|
|
17977
|
-
readToolPrefixes: ["mcp:"],
|
|
17978
|
-
readInputFields: ["command", "path", "file_path", "filePath"]
|
|
17979
|
-
};
|
|
17980
|
-
var PROVIDER_TOOL_SEMANTICS = {
|
|
17981
|
-
claude: CLAUDE_MATCHER,
|
|
17982
|
-
"claude-cli": CLAUDE_MATCHER,
|
|
17983
|
-
"claude-sdk": CLAUDE_MATCHER,
|
|
17984
|
-
codex: CODEX_MATCHER,
|
|
17985
|
-
"pi-coding-agent": PI_CODING_AGENT_MATCHER,
|
|
17986
|
-
"pi-cli": PI_CODING_AGENT_MATCHER,
|
|
17987
|
-
"copilot-cli": COPILOT_MATCHER,
|
|
17988
|
-
"copilot-log": COPILOT_MATCHER,
|
|
17989
|
-
"copilot-sdk": COPILOT_MATCHER,
|
|
17990
|
-
vscode: COPILOT_MATCHER,
|
|
17991
|
-
"vscode-insiders": COPILOT_MATCHER
|
|
17992
|
-
};
|
|
17993
18227
|
var SkillTriggerEvaluator = class {
|
|
17994
18228
|
kind = "skill-trigger";
|
|
17995
18229
|
config;
|
|
17996
18230
|
constructor(config) {
|
|
17997
18231
|
this.config = config;
|
|
17998
18232
|
}
|
|
17999
|
-
resolveMatcher(providerKind) {
|
|
18000
|
-
if (providerKind) {
|
|
18001
|
-
const match = PROVIDER_TOOL_SEMANTICS[providerKind];
|
|
18002
|
-
if (match) return match;
|
|
18003
|
-
}
|
|
18004
|
-
return CLAUDE_MATCHER;
|
|
18005
|
-
}
|
|
18006
18233
|
evaluate(context2) {
|
|
18007
18234
|
const skillName = this.config.skill;
|
|
18008
18235
|
const shouldTrigger = this.config.should_trigger !== false;
|
|
18009
|
-
const providerKind = context2.provider?.kind;
|
|
18010
|
-
const matcher = this.resolveMatcher(providerKind);
|
|
18011
18236
|
const allToolCalls = (context2.output ?? []).flatMap((msg) => msg.toolCalls ?? []);
|
|
18012
18237
|
let triggered = false;
|
|
18013
18238
|
let evidence = "";
|
|
18014
18239
|
for (const toolCall of allToolCalls) {
|
|
18015
18240
|
const toolName = toolCall.tool ?? "";
|
|
18016
18241
|
const input = toolCall.input ?? {};
|
|
18017
|
-
if (
|
|
18018
|
-
const skillArg = String(input
|
|
18242
|
+
if (toolName === "Skill") {
|
|
18243
|
+
const skillArg = String(input.skill ?? "");
|
|
18019
18244
|
if (skillArg.includes(skillName)) {
|
|
18020
18245
|
triggered = true;
|
|
18021
|
-
evidence = `Skill tool invoked with
|
|
18246
|
+
evidence = `Skill tool invoked with skill="${skillArg}"`;
|
|
18022
18247
|
break;
|
|
18023
18248
|
}
|
|
18024
|
-
} else if (
|
|
18025
|
-
|
|
18026
|
-
|
|
18027
|
-
triggered = true;
|
|
18028
|
-
evidence = `Skill tool invoked via tool name "${toolName}"`;
|
|
18029
|
-
break;
|
|
18030
|
-
} else if (matcher.readTools.includes(toolName)) {
|
|
18031
|
-
const filePath = this.readPathFromInput(input, matcher);
|
|
18032
|
-
if (filePath.includes(skillName)) {
|
|
18249
|
+
} else if (toolName === "Read") {
|
|
18250
|
+
const filePath = String(input.file_path ?? "");
|
|
18251
|
+
if (filePath.includes(`skills/${skillName}/`)) {
|
|
18033
18252
|
triggered = true;
|
|
18034
18253
|
evidence = `Read tool loaded skill file: ${filePath}`;
|
|
18035
18254
|
break;
|
|
18036
18255
|
}
|
|
18037
|
-
} else if (matcher.readToolPrefixes?.some(
|
|
18038
|
-
(prefix) => toolName.startsWith(prefix) && toolName.includes(skillName)
|
|
18039
|
-
)) {
|
|
18040
|
-
triggered = true;
|
|
18041
|
-
evidence = `Read tool loaded skill file via tool name "${toolName}"`;
|
|
18042
|
-
break;
|
|
18043
18256
|
}
|
|
18044
18257
|
if (!triggered && toolCall.output != null) {
|
|
18045
18258
|
const outputStr = typeof toolCall.output === "string" ? toolCall.output : JSON.stringify(toolCall.output);
|
|
@@ -18076,16 +18289,6 @@ var SkillTriggerEvaluator = class {
|
|
|
18076
18289
|
expectedAspectCount: 1
|
|
18077
18290
|
};
|
|
18078
18291
|
}
|
|
18079
|
-
readPathFromInput(input, matcher) {
|
|
18080
|
-
const fields = matcher.readInputFields ?? [matcher.readInputField];
|
|
18081
|
-
for (const field of fields) {
|
|
18082
|
-
const value = input[field];
|
|
18083
|
-
if (value !== void 0 && value !== null) {
|
|
18084
|
-
return String(value);
|
|
18085
|
-
}
|
|
18086
|
-
}
|
|
18087
|
-
return "";
|
|
18088
|
-
}
|
|
18089
18292
|
};
|
|
18090
18293
|
|
|
18091
18294
|
// src/evaluation/evaluators/llm-grader-prompt.ts
|
|
@@ -18935,10 +19138,12 @@ function runEqualsAssertion(output, value) {
|
|
|
18935
19138
|
|
|
18936
19139
|
// src/evaluation/orchestrator.ts
|
|
18937
19140
|
init_cjs_shims();
|
|
19141
|
+
var import_node_child_process11 = require("child_process");
|
|
18938
19142
|
var import_node_crypto11 = require("crypto");
|
|
18939
19143
|
var import_node_fs16 = require("fs");
|
|
18940
19144
|
var import_promises36 = require("fs/promises");
|
|
18941
19145
|
var import_node_path49 = __toESM(require("path"), 1);
|
|
19146
|
+
var import_node_util7 = require("util");
|
|
18942
19147
|
var import_micromatch3 = __toESM(require("micromatch"), 1);
|
|
18943
19148
|
|
|
18944
19149
|
// ../../node_modules/.bun/p-limit@6.2.0/node_modules/p-limit/index.js
|
|
@@ -20414,6 +20619,8 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
|
|
|
20414
20619
|
}
|
|
20415
20620
|
|
|
20416
20621
|
// src/evaluation/orchestrator.ts
|
|
20622
|
+
// Promise-returning form of child_process.execFile, built with util.promisify.
var execFileAsync3 = (0, import_node_util7.promisify)(import_node_child_process11.execFile);
// Timeout handed to execFile for workspace git commands: 3e5 ms (300 seconds).
var WORKSPACE_GIT_TIMEOUT_MS = 3e5;
|
|
20417
20624
|
/**
 * Maps a numeric score onto an execution status label.
 *
 * @param {number} score - Score to classify.
 * @param {number} [threshold=DEFAULT_THRESHOLD] - Minimum score (inclusive) that counts as passing.
 * @returns {"ok"|"quality_failure"} "ok" when score >= threshold, otherwise "quality_failure".
 */
function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
  if (score >= threshold) {
    return "ok";
  }
  return "quality_failure";
}
|
|
@@ -20451,6 +20658,35 @@ function hasHookCommand(hook) {
|
|
|
20451
20658
|
/**
 * Reports whether workspace hooks should run.
 *
 * Hooks are treated as enabled unless `workspace.hooks.enabled` is exactly `false`;
 * a missing workspace, missing `hooks`, or any other value counts as enabled.
 *
 * @param {object} [workspace] - Workspace configuration, possibly undefined.
 * @returns {boolean} false only when hooks were explicitly disabled.
 */
function hooksEnabled(workspace) {
  const enabledFlag = workspace?.hooks?.enabled;
  const explicitlyDisabled = enabledFlag === false;
  return !explicitlyDisabled;
}
|
|
20661
|
+
/**
 * Builds the environment used when running git inside a workspace.
 *
 * Starts from a copy of `process.env` (the live environment is not modified),
 * removes inherited `GIT_*` variables, and then sets `GIT_TERMINAL_PROMPT`,
 * `GIT_ASKPASS` and `GIT_SSH_COMMAND` to fixed values.
 *
 * @returns {Object<string, string>} Environment object for child git processes.
 */
function workspaceGitEnv() {
  // GIT_SSH_COMMAND survives this filter, but its value is replaced below,
  // so an inherited value never reaches the child process.
  const inherited = Object.fromEntries(
    Object.entries(process.env).filter(
      ([name]) => !name.startsWith("GIT_") || name === "GIT_SSH_COMMAND"
    )
  );
  return Object.assign(inherited, {
    GIT_TERMINAL_PROMPT: "0",
    GIT_ASKPASS: "",
    GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
  });
}
|
|
20675
|
+
/**
 * Resets a workspace directory that is itself a git checkout.
 *
 * Runs `git reset --hard <ref>` followed by `git clean` in `workspacePath`.
 * Does nothing when the directory has no `.git` entry.
 *
 * @param {string} workspacePath - Directory to reset; used as the git cwd.
 * @param {string} resetMode - "strict" cleans with `-fdx`; any other value cleans with `-fd`.
 * @param {string} [baselineRef] - Ref to reset to; falls back to "HEAD" when null/undefined.
 * @returns {Promise<boolean>} false when no `.git` entry exists, true after both commands succeed.
 *   Rejections from either git command propagate to the caller.
 */
async function resetWorkspaceRoot(workspacePath, resetMode, baselineRef) {
  const gitMarker = import_node_path49.default.join(workspacePath, ".git");
  if (!(0, import_node_fs16.existsSync)(gitMarker)) {
    return false;
  }
  const execOptions = {
    cwd: workspacePath,
    timeout: WORKSPACE_GIT_TIMEOUT_MS,
    env: workspaceGitEnv(),
    maxBuffer: 50 * 1024 * 1024
  };
  const targetRef = baselineRef ?? "HEAD";
  await execFileAsync3("git", ["reset", "--hard", targetRef], execOptions);
  const cleanArgs = resetMode === "strict" ? ["clean", "-fdx"] : ["clean", "-fd"];
  await execFileAsync3("git", cleanArgs, execOptions);
  return true;
}
|
|
20454
20690
|
function getWorkspaceTemplate(target) {
|
|
20455
20691
|
const config = target.config;
|
|
20456
20692
|
if ("workspaceTemplate" in config && typeof config.workspaceTemplate === "string") {
|
|
@@ -21712,6 +21948,37 @@ async function runEvalCase(options) {
|
|
|
21712
21948
|
}
|
|
21713
21949
|
}
|
|
21714
21950
|
}
|
|
21951
|
+
let beforeEachNeedsFreshBaseline = false;
|
|
21952
|
+
if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.before_each?.reset && evalCase.workspace.hooks.before_each.reset !== "none") {
|
|
21953
|
+
try {
|
|
21954
|
+
if (repoManager && evalCase.workspace.repos?.length) {
|
|
21955
|
+
await repoManager.reset(
|
|
21956
|
+
evalCase.workspace.repos,
|
|
21957
|
+
workspacePath,
|
|
21958
|
+
evalCase.workspace.hooks.before_each.reset
|
|
21959
|
+
);
|
|
21960
|
+
} else {
|
|
21961
|
+
await resetWorkspaceRoot(
|
|
21962
|
+
workspacePath,
|
|
21963
|
+
evalCase.workspace.hooks.before_each.reset,
|
|
21964
|
+
sharedBaselineCommit
|
|
21965
|
+
);
|
|
21966
|
+
}
|
|
21967
|
+
} catch (error) {
|
|
21968
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
21969
|
+
return buildErrorResult(
|
|
21970
|
+
evalCase,
|
|
21971
|
+
target.name,
|
|
21972
|
+
nowFn(),
|
|
21973
|
+
new Error(`before_each reset failed: ${message}`),
|
|
21974
|
+
promptInputs,
|
|
21975
|
+
provider,
|
|
21976
|
+
"setup",
|
|
21977
|
+
"script_error",
|
|
21978
|
+
verbose
|
|
21979
|
+
);
|
|
21980
|
+
}
|
|
21981
|
+
}
|
|
21715
21982
|
const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each;
|
|
21716
21983
|
if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeEachHook)) {
|
|
21717
21984
|
const beforeEachHook = caseBeforeEachHook;
|
|
@@ -21728,6 +21995,7 @@ async function runEvalCase(options) {
|
|
|
21728
21995
|
toScriptConfig(beforeEachHook, "before_each", `test '${evalCase.id}'`),
|
|
21729
21996
|
scriptContext
|
|
21730
21997
|
);
|
|
21998
|
+
beforeEachNeedsFreshBaseline = true;
|
|
21731
21999
|
} catch (error) {
|
|
21732
22000
|
const message = error instanceof Error ? error.message : String(error);
|
|
21733
22001
|
return buildErrorResult(
|
|
@@ -21743,7 +22011,7 @@ async function runEvalCase(options) {
|
|
|
21743
22011
|
);
|
|
21744
22012
|
}
|
|
21745
22013
|
}
|
|
21746
|
-
let baselineCommit = sharedBaselineCommit;
|
|
22014
|
+
let baselineCommit = beforeEachNeedsFreshBaseline ? void 0 : sharedBaselineCommit;
|
|
21747
22015
|
if (!baselineCommit && workspacePath) {
|
|
21748
22016
|
try {
|
|
21749
22017
|
baselineCommit = await initializeBaseline(workspacePath);
|
|
@@ -21754,6 +22022,35 @@ async function runEvalCase(options) {
|
|
|
21754
22022
|
}
|
|
21755
22023
|
}
|
|
21756
22024
|
}
|
|
22025
|
+
if (evalCase.mode === "conversation" && evalCase.turns?.length) {
|
|
22026
|
+
const conversationResult = await runConversationMode({
|
|
22027
|
+
evalCase,
|
|
22028
|
+
provider,
|
|
22029
|
+
target,
|
|
22030
|
+
evaluators,
|
|
22031
|
+
typeRegistry,
|
|
22032
|
+
graderProvider,
|
|
22033
|
+
promptInputs,
|
|
22034
|
+
nowFn,
|
|
22035
|
+
signal,
|
|
22036
|
+
workspacePath,
|
|
22037
|
+
caseWorkspaceFile: caseWorkspaceFile ?? suiteWorkspaceFile,
|
|
22038
|
+
agentTimeoutMs,
|
|
22039
|
+
streamCallbacks: options.streamCallbacks,
|
|
22040
|
+
verbose,
|
|
22041
|
+
threshold: evalCase.threshold ?? caseThreshold,
|
|
22042
|
+
targetResolver,
|
|
22043
|
+
availableTargets
|
|
22044
|
+
});
|
|
22045
|
+
if (workspacePath && !isSharedWorkspace) {
|
|
22046
|
+
const shouldRetain = conversationResult.executionStatus === "ok" ? retainOnSuccess === "keep" || keepWorkspaces : retainOnFailure === "keep" || !forceCleanup && !keepWorkspaces;
|
|
22047
|
+
if (!shouldRetain) {
|
|
22048
|
+
await cleanupWorkspace(workspacePath).catch(() => {
|
|
22049
|
+
});
|
|
22050
|
+
}
|
|
22051
|
+
}
|
|
22052
|
+
return conversationResult;
|
|
22053
|
+
}
|
|
21757
22054
|
const caseStartMs = Date.now();
|
|
21758
22055
|
const attemptBudget = (maxRetries ?? 0) + 1;
|
|
21759
22056
|
let attempt = 0;
|
|
@@ -21868,13 +22165,21 @@ async function runEvalCase(options) {
|
|
|
21868
22165
|
${providerFileChanges}` : providerFileChanges;
|
|
21869
22166
|
}
|
|
21870
22167
|
const providerError = extractProviderError(providerResponse);
|
|
21871
|
-
if (caseHooksEnabled &&
|
|
22168
|
+
if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none") {
|
|
21872
22169
|
try {
|
|
21873
|
-
|
|
21874
|
-
|
|
21875
|
-
|
|
21876
|
-
|
|
21877
|
-
|
|
22170
|
+
if (repoManager && evalCase.workspace.repos?.length) {
|
|
22171
|
+
await repoManager.reset(
|
|
22172
|
+
evalCase.workspace.repos,
|
|
22173
|
+
workspacePath,
|
|
22174
|
+
evalCase.workspace.hooks.after_each.reset
|
|
22175
|
+
);
|
|
22176
|
+
} else {
|
|
22177
|
+
await resetWorkspaceRoot(
|
|
22178
|
+
workspacePath,
|
|
22179
|
+
evalCase.workspace.hooks.after_each.reset,
|
|
22180
|
+
baselineCommit
|
|
22181
|
+
);
|
|
22182
|
+
}
|
|
21878
22183
|
} catch {
|
|
21879
22184
|
}
|
|
21880
22185
|
}
|
|
@@ -22490,6 +22795,276 @@ function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
|
22490
22795
|
"llm-grader": llmGrader
|
|
22491
22796
|
};
|
|
22492
22797
|
}
|
|
22798
|
+
/**
 * Runs an eval case turn by turn and returns one aggregated result.
 *
 * For each entry in `evalCase.turns` the user input is appended to a running
 * history, the provider is invoked with that history, the assistant reply is
 * appended, and the turn is graded via `evaluateCandidate`. When the case also
 * declares top-level `assertions`, the complete transcript is graded once more.
 * Turn scores and the optional conversation score are combined with
 * `aggregateConversationScores`.
 *
 * @param {object} options - Execution context; the fields read are destructured below.
 *   `evalCase` supplies `turns`, `input`, and the optional `aggregation`,
 *   `on_turn_failure`, `window_size` and `assertions` settings.
 * @returns {Promise<object>} Result record with the aggregated `score`, per-turn and
 *   conversation entries in `scores`, flattened `assertions`, the transcript as
 *   `output`, `executionStatus`, and `evalRun.durationMs`.
 */
async function runConversationMode(options) {
  // NOTE(review): `promptInputs` is destructured but not read below; the grader
  // calls build their own promptInputs from the running history.
  const {
    evalCase,
    provider,
    target,
    evaluators,
    typeRegistry,
    graderProvider,
    promptInputs,
    nowFn,
    signal,
    workspacePath,
    caseWorkspaceFile,
    agentTimeoutMs,
    streamCallbacks,
    verbose,
    threshold,
    targetResolver,
    availableTargets
  } = options;
  const turns = evalCase.turns;
  const aggregation = evalCase.aggregation ?? "mean";
  const onTurnFailure = evalCase.on_turn_failure ?? "continue";
  const windowSize = evalCase.window_size;
  // Seed the history with the case's initial messages; non-string content is
  // serialized so every history entry carries string content.
  const history = [];
  for (const msg of evalCase.input) {
    const content = typeof msg.content === "string" ? msg.content : JSON.stringify(msg.content);
    history.push({ role: msg.role, content });
  }
  // turnScores holds the per-turn score records; allTurnScoreValues holds the
  // matching numeric scores that feed the final aggregation.
  const turnScores = [];
  const allTurnScoreValues = [];
  let stopped = false;
  const caseStartMs = Date.now();
  for (let i = 0; i < turns.length; i++) {
    const turn = turns[i];
    // Turn numbers in names and ids are 1-based.
    const turnIndex = i + 1;
    if (stopped) {
      // A previous turn failed with on_turn_failure === "stop": record the
      // remaining turns as skipped with score 0 without calling the provider.
      turnScores.push({
        name: `turn-${turnIndex}`,
        type: "rubrics",
        score: 0,
        verdict: "skip",
        assertions: [{ text: "Skipped due to previous turn failure", passed: false }]
      });
      allTurnScoreValues.push(0);
      continue;
    }
    const userContent = typeof turn.input === "string" ? turn.input : JSON.stringify(turn.input);
    history.push({ role: "user", content: userContent });
    // With a window size set, send only the windowed history; otherwise send a
    // copy of the full history.
    const chatPromptForProvider = windowSize ? buildWindowedHistory(history, windowSize) : [...history];
    let response;
    try {
      response = await provider.invoke({
        question: userContent,
        chatPrompt: chatPromptForProvider,
        evalCaseId: `${evalCase.id}/turn-${turnIndex}`,
        signal,
        cwd: workspacePath,
        workspaceFile: caseWorkspaceFile,
        streamCallbacks
      });
    } catch (error) {
      // A provider failure scores the turn 0. The user message pushed above
      // stays in the history with no assistant reply after it.
      const message = error instanceof Error ? error.message : String(error);
      turnScores.push({
        name: `turn-${turnIndex}`,
        type: "rubrics",
        score: 0,
        verdict: "fail",
        assertions: [{ text: `Provider error: ${message}`, passed: false }]
      });
      allTurnScoreValues.push(0);
      if (onTurnFailure === "stop") stopped = true;
      continue;
    }
    const assistantContent = extractLastAssistantContent(response.output);
    history.push({ role: "assistant", content: assistantContent });
    if (!turn.assertions?.length && !turn.expected_output) {
      // Nothing to grade for this turn: it counts as a pass with score 1.
      turnScores.push({
        name: `turn-${turnIndex}`,
        type: "rubrics",
        score: 1,
        verdict: "pass",
        assertions: []
      });
      allTurnScoreValues.push(1);
      continue;
    }
    const turnAssertions = buildTurnAssertions(turn);
    // Per-turn eval case: a copy of the parent case with the turn's own id,
    // assertions, history-derived input and expected output.
    const turnEvalCase = {
      ...evalCase,
      id: `${evalCase.id}/turn-${turnIndex}`,
      assertions: turnAssertions,
      input: buildTurnGraderInput(history, windowSize),
      // A string expected_output is wrapped as { content }; any other value is
      // passed through as the single array element.
      expected_output: turn.expected_output ? [
        typeof turn.expected_output === "string" ? { content: turn.expected_output } : turn.expected_output
      ] : [],
      // Clear conversation fields to prevent recursion
      mode: void 0,
      turns: void 0
    };
    const turnResult = await evaluateCandidate({
      evalCase: turnEvalCase,
      candidate: assistantContent,
      target,
      provider,
      evaluators,
      typeRegistry,
      promptInputs: {
        question: buildConversationContext(history, windowSize),
        chatPrompt: windowSize ? buildWindowedHistory(history, windowSize) : [...history]
      },
      nowFn,
      attempt: 0,
      graderProvider,
      agentTimeoutMs,
      output: response.output,
      verbose,
      threshold,
      targetResolver,
      availableTargets
    });
    const turnScore = turnResult.score;
    allTurnScoreValues.push(turnScore);
    turnScores.push({
      name: `turn-${turnIndex}`,
      type: "rubrics",
      score: turnScore,
      verdict: scoreToVerdict(turnScore, threshold ?? DEFAULT_THRESHOLD),
      assertions: turnResult.assertions ? [...turnResult.assertions] : [],
      scores: turnResult.scores
    });
    // In "stop" mode a below-threshold turn causes all later turns to be skipped.
    if (onTurnFailure === "stop" && turnScore < (threshold ?? DEFAULT_THRESHOLD)) {
      stopped = true;
    }
  }
  // Optional conversation-level grading: the case's own assertions are
  // evaluated against the full, unwindowed transcript.
  let conversationScores = [];
  if (evalCase.assertions?.length) {
    const conversationEvalCase = {
      ...evalCase,
      id: `${evalCase.id}/conversation`,
      input: history.map((m) => ({
        role: m.role,
        content: m.content
      })),
      expected_output: [],
      mode: void 0,
      turns: void 0
    };
    // Transcript rendered as "role: content" paragraphs separated by blank lines.
    const fullTranscript = history.map((m) => {
      const content = typeof m.content === "string" ? m.content : JSON.stringify(m.content);
      return `${m.role}: ${content}`;
    }).join("\n\n");
    const conversationResult = await evaluateCandidate({
      evalCase: conversationEvalCase,
      candidate: fullTranscript,
      target,
      provider,
      evaluators,
      typeRegistry,
      promptInputs: {
        question: fullTranscript,
        chatPrompt: [...history]
      },
      nowFn,
      attempt: 0,
      graderProvider,
      agentTimeoutMs,
      verbose,
      threshold,
      targetResolver,
      availableTargets
    });
    conversationScores = [
      {
        name: "conversation",
        type: "rubrics",
        score: conversationResult.score,
        verdict: scoreToVerdict(
          conversationResult.score,
          threshold ?? DEFAULT_THRESHOLD
        ),
        assertions: conversationResult.assertions ? [...conversationResult.assertions] : [],
        scores: conversationResult.scores
      }
    ];
  }
  // The final score aggregates every turn score (failed and skipped turns
  // count as 0) plus the conversation-level score when one was produced.
  const allScoreValues = [...allTurnScoreValues, ...conversationScores.map((s) => s.score)];
  const finalScore = aggregateConversationScores(allScoreValues, aggregation);
  const allResultScores = [...turnScores, ...conversationScores];
  const outputMessages = history.map((m) => ({
    role: m.role,
    content: m.content
  }));
  const flatAssertions = allResultScores.flatMap((s) => [...s.assertions]);
  const totalDurationMs = Date.now() - caseStartMs;
  return {
    timestamp: nowFn().toISOString(),
    testId: evalCase.id,
    suite: evalCase.suite,
    category: evalCase.category,
    score: finalScore,
    assertions: flatAssertions,
    target: target.name,
    output: outputMessages,
    scores: allResultScores,
    // Status is derived from the aggregated score only; provider errors in
    // individual turns surface here through their 0 scores.
    executionStatus: classifyQualityStatus(finalScore, threshold ?? DEFAULT_THRESHOLD),
    input: evalCase.input.map((m) => ({
      role: m.role,
      content: typeof m.content === "string" ? m.content : JSON.stringify(m.content)
    })),
    evalRun: { durationMs: totalDurationMs }
  };
}
|
|
23011
|
+
/**
 * Trims a conversation history to a sliding window.
 *
 * All system messages are kept and placed first; of the remaining messages only
 * the last `windowSize * 2` are kept, in their original order.
 *
 * A window size that is not a positive finite number (0, negative, NaN,
 * Infinity, null/undefined) disables trimming. Previously a negative size
 * turned `slice(-windowSize * 2)` into a positive start index, which silently
 * dropped the oldest messages instead of applying a window.
 *
 * @param {Array<{role: string, content: *}>} history - Messages in chronological order.
 * @param {number} windowSize - Window size; twice this many non-system messages are retained.
 * @returns {Array<{role: string, content: *}>} New array: system messages, then the retained tail.
 */
function buildWindowedHistory(history, windowSize) {
  const systemMessages = history.filter((m) => m.role === "system");
  const nonSystem = history.filter((m) => m.role !== "system");
  // Coerce like the arithmetic in the original expression did, so numeric
  // strings keep working.
  const size = Number(windowSize);
  const hasValidWindow = Number.isFinite(size) && size > 0;
  const windowed = hasValidWindow ? nonSystem.slice(-size * 2) : nonSystem;
  return [...systemMessages, ...windowed];
}
|
|
23017
|
+
/**
 * Renders a conversation as plain text, one "role: content" paragraph per
 * message, separated by blank lines.
 *
 * @param {Array<{role: string, content: *}>} history - Messages in chronological order.
 * @param {number} [windowSize] - When truthy, the history is first trimmed with buildWindowedHistory.
 * @returns {string} The rendered transcript; non-string content is JSON-serialized.
 */
function buildConversationContext(history, windowSize) {
  const visible = windowSize ? buildWindowedHistory(history, windowSize) : history;
  const renderLine = (entry) => {
    const text = typeof entry.content === "string" ? entry.content : JSON.stringify(entry.content);
    return `${entry.role}: ${text}`;
  };
  return visible.map((entry) => renderLine(entry)).join("\n\n");
}
|
|
23024
|
+
/**
 * Produces the message list handed to the grader as a turn's `input`.
 *
 * @param {Array<{role: string, content: *}>} history - Messages in chronological order.
 * @param {number} [windowSize] - When truthy, the history is first trimmed with buildWindowedHistory.
 * @returns {Array<{role: string, content: *}>} Fresh objects carrying only `role` and `content`.
 */
function buildTurnGraderInput(history, windowSize) {
  const visible = windowSize ? buildWindowedHistory(history, windowSize) : history;
  const graderInput = visible.map((entry) => {
    const { role, content } = entry;
    return { role, content };
  });
  return graderInput;
}
|
|
23031
|
+
/**
 * Converts a turn's `assertions` list into grader configurations.
 *
 * String entries are collected into a single "llm-grader" entry named
 * "turn-rubrics", one rubric per string with weight 1 and ids numbered from
 * "criterion-1". Non-string entries are passed through unchanged after it.
 *
 * @param {{assertions?: Array<string|object>}} turn - Turn definition.
 * @returns {Array<object>} Rubric grader (when any strings exist) followed by the structured entries;
 *   an empty array when the turn has no assertions.
 */
function buildTurnAssertions(turn) {
  if (!turn.assertions?.length) return [];
  const entries = [...turn.assertions];
  const isCriterionText = (entry) => typeof entry === "string";
  const criteria = entries.filter((entry) => isCriterionText(entry));
  const passthrough = entries.filter((entry) => !isCriterionText(entry));
  if (criteria.length === 0) {
    return [...passthrough];
  }
  const rubricGrader = {
    name: "turn-rubrics",
    type: "llm-grader",
    rubrics: criteria.map((outcome, position) => ({
      id: `criterion-${position + 1}`,
      outcome,
      weight: 1
    }))
  };
  return [rubricGrader, ...passthrough];
}
|
|
23057
|
+
/**
 * Collapses a list of scores into a single value.
 *
 * @param {number[]} scores - Scores to combine.
 * @param {string} aggregation - "min" or "max"; any other value yields the arithmetic mean.
 * @returns {number} The aggregated score, or 1 when `scores` is empty.
 */
function aggregateConversationScores(scores, aggregation) {
  if (scores.length === 0) return 1;
  if (aggregation === "min") {
    return Math.min(...scores);
  }
  if (aggregation === "max") {
    return Math.max(...scores);
  }
  let total = 0;
  for (const value of scores) {
    total += value;
  }
  return total / scores.length;
}
|
|
22493
23068
|
async function invokeProvider(provider, options) {
|
|
22494
23069
|
const {
|
|
22495
23070
|
evalCase,
|
|
@@ -23222,13 +23797,13 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
23222
23797
|
|
|
23223
23798
|
// src/evaluation/results-repo.ts
|
|
23224
23799
|
init_cjs_shims();
|
|
23225
|
-
var
|
|
23800
|
+
var import_node_child_process12 = require("child_process");
|
|
23226
23801
|
var import_node_fs18 = require("fs");
|
|
23227
23802
|
var import_promises39 = require("fs/promises");
|
|
23228
23803
|
var import_node_os9 = __toESM(require("os"), 1);
|
|
23229
23804
|
var import_node_path53 = __toESM(require("path"), 1);
|
|
23230
|
-
var
|
|
23231
|
-
var
|
|
23805
|
+
var import_node_util8 = require("util");
|
|
23806
|
+
var execFileAsync4 = (0, import_node_util8.promisify)(import_node_child_process12.execFile);
|
|
23232
23807
|
/**
 * Turns a repository identifier into a slug.
 *
 * Surrounding whitespace is trimmed, then every run of characters other than
 * ASCII letters, digits, ".", "_" and "-" is replaced by a single "-".
 *
 * @param {string} repo - Repository identifier.
 * @returns {string} The sanitized slug.
 */
function sanitizeRepoSlug(repo) {
  const trimmed = repo.trim();
  const disallowedRun = /[^A-Za-z0-9._-]+/g;
  return trimmed.replace(disallowedRun, "-");
}
|
|
@@ -23279,7 +23854,7 @@ function writePersistedStatus(statusFile, status) {
|
|
|
23279
23854
|
}
|
|
23280
23855
|
async function runCommand(executable, args, options) {
|
|
23281
23856
|
try {
|
|
23282
|
-
const { stdout, stderr } = await
|
|
23857
|
+
const { stdout, stderr } = await execFileAsync4(executable, [...args], {
|
|
23283
23858
|
cwd: options?.cwd,
|
|
23284
23859
|
env: process.env
|
|
23285
23860
|
});
|
|
@@ -24341,11 +24916,13 @@ function extractAssistantContent(content) {
|
|
|
24341
24916
|
break;
|
|
24342
24917
|
case "tool_use":
|
|
24343
24918
|
if (block.name) {
|
|
24344
|
-
toolCalls.push(
|
|
24345
|
-
|
|
24346
|
-
|
|
24347
|
-
|
|
24348
|
-
|
|
24919
|
+
toolCalls.push(
|
|
24920
|
+
normalizeToolCall("claude", {
|
|
24921
|
+
tool: block.name,
|
|
24922
|
+
input: block.input,
|
|
24923
|
+
id: block.id
|
|
24924
|
+
})
|
|
24925
|
+
);
|
|
24349
24926
|
}
|
|
24350
24927
|
break;
|
|
24351
24928
|
}
|
|
@@ -24438,7 +25015,11 @@ function parseCodexSession(jsonl) {
|
|
|
24438
25015
|
} else {
|
|
24439
25016
|
input = payload.arguments;
|
|
24440
25017
|
}
|
|
24441
|
-
const toolCall =
|
|
25018
|
+
const toolCall = normalizeToolCall("codex", {
|
|
25019
|
+
tool: toolName,
|
|
25020
|
+
input,
|
|
25021
|
+
id: callId
|
|
25022
|
+
});
|
|
24442
25023
|
const msgIdx = messages.length;
|
|
24443
25024
|
messages.push({
|
|
24444
25025
|
role: "assistant",
|
|
@@ -24462,7 +25043,11 @@ function parseCodexSession(jsonl) {
|
|
|
24462
25043
|
} else {
|
|
24463
25044
|
input = payload.arguments;
|
|
24464
25045
|
}
|
|
24465
|
-
const toolCall =
|
|
25046
|
+
const toolCall = normalizeToolCall("codex", {
|
|
25047
|
+
tool: toolName,
|
|
25048
|
+
input,
|
|
25049
|
+
id: callId
|
|
25050
|
+
});
|
|
24466
25051
|
const msgIdx = messages.length;
|
|
24467
25052
|
messages.push({
|
|
24468
25053
|
role: "assistant",
|