@agentv/core 4.14.0 → 4.15.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-A3HYVKTI.js → chunk-AOOU6PLC.js} +70 -2
- package/dist/chunk-AOOU6PLC.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +89 -11
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +90 -12
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +774 -189
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +68 -14
- package/dist/index.d.ts +68 -14
- package/dist/index.js +705 -189
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-A3HYVKTI.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -25,7 +25,7 @@ import {
|
|
|
25
25
|
resolveDelegatedTargetDefinition,
|
|
26
26
|
resolveFileReference,
|
|
27
27
|
resolveTargetDefinition
|
|
28
|
-
} from "./chunk-A3HYVKTI.js";
|
|
28
|
+
} from "./chunk-AOOU6PLC.js";
|
|
29
29
|
import {
|
|
30
30
|
execFileWithStdin,
|
|
31
31
|
execShellWithStdin
|
|
@@ -3673,10 +3673,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
3673
3673
|
const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
|
|
3674
3674
|
const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
|
|
3675
3675
|
const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
|
|
3676
|
-
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
|
|
3676
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0 || Array.isArray(testCaseConfig.turns) && testCaseConfig.turns.length > 0;
|
|
3677
3677
|
if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
|
|
3678
3678
|
logError3(
|
|
3679
|
-
`Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
|
|
3679
|
+
`Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions/turns`
|
|
3680
3680
|
);
|
|
3681
3681
|
continue;
|
|
3682
3682
|
}
|
|
@@ -3753,6 +3753,14 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
3753
3753
|
) : void 0;
|
|
3754
3754
|
const onDependencyFailureRaw = asString5(testCaseConfig.on_dependency_failure);
|
|
3755
3755
|
const onDependencyFailure = onDependencyFailureRaw === "skip" || onDependencyFailureRaw === "fail" || onDependencyFailureRaw === "run" ? onDependencyFailureRaw : void 0;
|
|
3756
|
+
const modeRaw = asString5(testCaseConfig.mode);
|
|
3757
|
+
const mode = modeRaw === "conversation" ? "conversation" : void 0;
|
|
3758
|
+
const turns = Array.isArray(testCaseConfig.turns) ? parseTurns(testCaseConfig.turns) : void 0;
|
|
3759
|
+
const aggregationRaw = asString5(testCaseConfig.aggregation);
|
|
3760
|
+
const aggregation = aggregationRaw === "mean" || aggregationRaw === "min" || aggregationRaw === "max" ? aggregationRaw : void 0;
|
|
3761
|
+
const onTurnFailureRaw = asString5(testCaseConfig.on_turn_failure);
|
|
3762
|
+
const onTurnFailure = onTurnFailureRaw === "continue" || onTurnFailureRaw === "stop" ? onTurnFailureRaw : void 0;
|
|
3763
|
+
const windowSize = typeof testCaseConfig.window_size === "number" && testCaseConfig.window_size >= 1 ? testCaseConfig.window_size : void 0;
|
|
3756
3764
|
const testCase = {
|
|
3757
3765
|
id,
|
|
3758
3766
|
suite: suiteName,
|
|
@@ -3771,6 +3779,11 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
3771
3779
|
metadata,
|
|
3772
3780
|
targets: caseTargets,
|
|
3773
3781
|
...caseThreshold !== void 0 ? { threshold: caseThreshold } : {},
|
|
3782
|
+
...mode ? { mode } : {},
|
|
3783
|
+
...turns && turns.length > 0 ? { turns } : {},
|
|
3784
|
+
...aggregation ? { aggregation } : {},
|
|
3785
|
+
...onTurnFailure ? { on_turn_failure: onTurnFailure } : {},
|
|
3786
|
+
...windowSize !== void 0 ? { window_size: windowSize } : {},
|
|
3774
3787
|
...dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {},
|
|
3775
3788
|
...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}
|
|
3776
3789
|
};
|
|
@@ -3788,6 +3801,25 @@ async function loadTestById(evalFilePath, repoRoot, evalId) {
|
|
|
3788
3801
|
return match;
|
|
3789
3802
|
}
|
|
3790
3803
|
var loadEvalCaseById = loadTestById;
|
|
3804
|
+
function parseTurns(rawTurns) {
|
|
3805
|
+
return rawTurns.map((rawTurn) => {
|
|
3806
|
+
const turn = rawTurn;
|
|
3807
|
+
const input = turn.input;
|
|
3808
|
+
const expectedOutput = turn.expected_output;
|
|
3809
|
+
let assertions;
|
|
3810
|
+
if (Array.isArray(turn.assertions)) {
|
|
3811
|
+
assertions = turn.assertions.map((a) => {
|
|
3812
|
+
if (typeof a === "string") return a;
|
|
3813
|
+
return a;
|
|
3814
|
+
});
|
|
3815
|
+
}
|
|
3816
|
+
return {
|
|
3817
|
+
input,
|
|
3818
|
+
...expectedOutput !== void 0 ? { expected_output: expectedOutput } : {},
|
|
3819
|
+
...assertions && assertions.length > 0 ? { assertions } : {}
|
|
3820
|
+
};
|
|
3821
|
+
});
|
|
3822
|
+
}
|
|
3791
3823
|
function parseCommandArray(source) {
|
|
3792
3824
|
if (typeof source === "string") {
|
|
3793
3825
|
const parts = source.trim().split(/\s+/);
|
|
@@ -4745,6 +4777,154 @@ function subscribeToClaudeLogEntries(listener) {
|
|
|
4745
4777
|
};
|
|
4746
4778
|
}
|
|
4747
4779
|
|
|
4780
|
+
// src/evaluation/providers/normalize-tool-call.ts
|
|
4781
|
+
var TOOL_NAME_MAP = /* @__PURE__ */ new Map([
|
|
4782
|
+
// --- Claude (already canonical) ---
|
|
4783
|
+
["claude::Skill", "Skill"],
|
|
4784
|
+
["claude::Read", "Read"],
|
|
4785
|
+
["claude::Write", "Write"],
|
|
4786
|
+
["claude::Edit", "Edit"],
|
|
4787
|
+
["claude::Bash", "Bash"],
|
|
4788
|
+
["claude-cli::Skill", "Skill"],
|
|
4789
|
+
["claude-cli::Read", "Read"],
|
|
4790
|
+
["claude-cli::Write", "Write"],
|
|
4791
|
+
["claude-cli::Edit", "Edit"],
|
|
4792
|
+
["claude-cli::Bash", "Bash"],
|
|
4793
|
+
["claude-sdk::Skill", "Skill"],
|
|
4794
|
+
["claude-sdk::Read", "Read"],
|
|
4795
|
+
["claude-sdk::Write", "Write"],
|
|
4796
|
+
["claude-sdk::Edit", "Edit"],
|
|
4797
|
+
["claude-sdk::Bash", "Bash"],
|
|
4798
|
+
// --- Copilot ---
|
|
4799
|
+
["copilot-cli::Skill", "Skill"],
|
|
4800
|
+
["copilot-cli::skill", "Skill"],
|
|
4801
|
+
["copilot-cli::Read File", "Read"],
|
|
4802
|
+
["copilot-cli::readFile", "Read"],
|
|
4803
|
+
["copilot-cli::Read", "Read"],
|
|
4804
|
+
["copilot-cli::readTextFile", "Read"],
|
|
4805
|
+
["copilot-cli::writeTextFile", "Write"],
|
|
4806
|
+
["copilot-cli::Write File", "Write"],
|
|
4807
|
+
["copilot-cli::editFile", "Edit"],
|
|
4808
|
+
["copilot-cli::Edit File", "Edit"],
|
|
4809
|
+
["copilot-cli::runTerminalCommand", "Bash"],
|
|
4810
|
+
["copilot-sdk::Skill", "Skill"],
|
|
4811
|
+
["copilot-sdk::skill", "Skill"],
|
|
4812
|
+
["copilot-sdk::Read File", "Read"],
|
|
4813
|
+
["copilot-sdk::readFile", "Read"],
|
|
4814
|
+
["copilot-sdk::Read", "Read"],
|
|
4815
|
+
["copilot-sdk::readTextFile", "Read"],
|
|
4816
|
+
["copilot-sdk::writeTextFile", "Write"],
|
|
4817
|
+
["copilot-sdk::Write File", "Write"],
|
|
4818
|
+
["copilot-sdk::editFile", "Edit"],
|
|
4819
|
+
["copilot-sdk::Edit File", "Edit"],
|
|
4820
|
+
["copilot-sdk::runTerminalCommand", "Bash"],
|
|
4821
|
+
["copilot-log::Skill", "Skill"],
|
|
4822
|
+
["copilot-log::skill", "Skill"],
|
|
4823
|
+
["copilot-log::Read File", "Read"],
|
|
4824
|
+
["copilot-log::readFile", "Read"],
|
|
4825
|
+
["copilot-log::Read", "Read"],
|
|
4826
|
+
["copilot-log::readTextFile", "Read"],
|
|
4827
|
+
["copilot-log::writeTextFile", "Write"],
|
|
4828
|
+
["copilot-log::Write File", "Write"],
|
|
4829
|
+
["copilot-log::editFile", "Edit"],
|
|
4830
|
+
["copilot-log::Edit File", "Edit"],
|
|
4831
|
+
["copilot-log::runTerminalCommand", "Bash"],
|
|
4832
|
+
["vscode::Skill", "Skill"],
|
|
4833
|
+
["vscode::skill", "Skill"],
|
|
4834
|
+
["vscode::Read File", "Read"],
|
|
4835
|
+
["vscode::readFile", "Read"],
|
|
4836
|
+
["vscode::Read", "Read"],
|
|
4837
|
+
["vscode::readTextFile", "Read"],
|
|
4838
|
+
["vscode::writeTextFile", "Write"],
|
|
4839
|
+
["vscode::Write File", "Write"],
|
|
4840
|
+
["vscode::editFile", "Edit"],
|
|
4841
|
+
["vscode::Edit File", "Edit"],
|
|
4842
|
+
["vscode::runTerminalCommand", "Bash"],
|
|
4843
|
+
["vscode-insiders::Skill", "Skill"],
|
|
4844
|
+
["vscode-insiders::skill", "Skill"],
|
|
4845
|
+
["vscode-insiders::Read File", "Read"],
|
|
4846
|
+
["vscode-insiders::readFile", "Read"],
|
|
4847
|
+
["vscode-insiders::Read", "Read"],
|
|
4848
|
+
["vscode-insiders::readTextFile", "Read"],
|
|
4849
|
+
["vscode-insiders::writeTextFile", "Write"],
|
|
4850
|
+
["vscode-insiders::Write File", "Write"],
|
|
4851
|
+
["vscode-insiders::editFile", "Edit"],
|
|
4852
|
+
["vscode-insiders::Edit File", "Edit"],
|
|
4853
|
+
["vscode-insiders::runTerminalCommand", "Bash"],
|
|
4854
|
+
// --- Codex ---
|
|
4855
|
+
["codex::command_execution", "Bash"],
|
|
4856
|
+
["codex::file_change", "Edit"],
|
|
4857
|
+
// --- Pi ---
|
|
4858
|
+
["pi-coding-agent::read", "Read"],
|
|
4859
|
+
["pi-coding-agent::bash", "Bash"],
|
|
4860
|
+
["pi-cli::read", "Read"],
|
|
4861
|
+
["pi-cli::bash", "Bash"]
|
|
4862
|
+
]);
|
|
4863
|
+
var COPILOT_PREFIXES = [
|
|
4864
|
+
{ prefix: "Using skill: ", canonical: "Skill", extractSkillFromName: true },
|
|
4865
|
+
{ prefix: "Viewing ", canonical: "Read" }
|
|
4866
|
+
];
|
|
4867
|
+
var CODEX_PREFIXES = [
|
|
4868
|
+
{ prefix: "mcp:", canonical: "Skill", extractSkillFromName: true }
|
|
4869
|
+
];
|
|
4870
|
+
var TOOL_PREFIX_MAP = /* @__PURE__ */ new Map([
|
|
4871
|
+
["copilot-cli", COPILOT_PREFIXES],
|
|
4872
|
+
["copilot-sdk", COPILOT_PREFIXES],
|
|
4873
|
+
["copilot-log", COPILOT_PREFIXES],
|
|
4874
|
+
["vscode", COPILOT_PREFIXES],
|
|
4875
|
+
["vscode-insiders", COPILOT_PREFIXES],
|
|
4876
|
+
["codex", CODEX_PREFIXES]
|
|
4877
|
+
]);
|
|
4878
|
+
var normalizeSkillInput = (input) => {
|
|
4879
|
+
if (input.skill !== void 0) return input;
|
|
4880
|
+
return input;
|
|
4881
|
+
};
|
|
4882
|
+
var normalizeReadInput = (input) => {
|
|
4883
|
+
if (input.file_path !== void 0) return input;
|
|
4884
|
+
if (input.path !== void 0) return { ...input, file_path: input.path };
|
|
4885
|
+
if (input.filePath !== void 0) return { ...input, file_path: input.filePath };
|
|
4886
|
+
return input;
|
|
4887
|
+
};
|
|
4888
|
+
var INPUT_NORMALIZERS = /* @__PURE__ */ new Map([
|
|
4889
|
+
["Skill", normalizeSkillInput],
|
|
4890
|
+
["Read", normalizeReadInput]
|
|
4891
|
+
]);
|
|
4892
|
+
function normalizeToolCall(providerKind, tc) {
|
|
4893
|
+
const nativeName = tc.tool;
|
|
4894
|
+
const exactKey = `${providerKind}::${nativeName}`;
|
|
4895
|
+
const canonical = TOOL_NAME_MAP.get(exactKey);
|
|
4896
|
+
if (canonical) {
|
|
4897
|
+
return applyInputNormalization(canonical, { ...tc, tool: canonical });
|
|
4898
|
+
}
|
|
4899
|
+
const prefixRules = TOOL_PREFIX_MAP.get(providerKind);
|
|
4900
|
+
if (prefixRules) {
|
|
4901
|
+
for (const rule of prefixRules) {
|
|
4902
|
+
if (nativeName.startsWith(rule.prefix)) {
|
|
4903
|
+
const suffix = nativeName.slice(rule.prefix.length);
|
|
4904
|
+
let normalizedInput = tc.input;
|
|
4905
|
+
if (rule.extractSkillFromName && suffix) {
|
|
4906
|
+
const existingInput = tc.input ?? {};
|
|
4907
|
+
normalizedInput = { ...existingInput, skill: suffix };
|
|
4908
|
+
}
|
|
4909
|
+
const normalized = {
|
|
4910
|
+
...tc,
|
|
4911
|
+
tool: rule.canonical,
|
|
4912
|
+
input: normalizedInput
|
|
4913
|
+
};
|
|
4914
|
+
return applyInputNormalization(rule.canonical, normalized);
|
|
4915
|
+
}
|
|
4916
|
+
}
|
|
4917
|
+
}
|
|
4918
|
+
return tc;
|
|
4919
|
+
}
|
|
4920
|
+
function applyInputNormalization(canonical, tc) {
|
|
4921
|
+
const normalizer = INPUT_NORMALIZERS.get(canonical);
|
|
4922
|
+
if (!normalizer || tc.input === void 0 || tc.input === null) return tc;
|
|
4923
|
+
const input = tc.input;
|
|
4924
|
+
const normalized = normalizer(input);
|
|
4925
|
+
return normalized === input ? tc : { ...tc, input: normalized };
|
|
4926
|
+
}
|
|
4927
|
+
|
|
4748
4928
|
// src/evaluation/providers/preread.ts
|
|
4749
4929
|
import path10 from "node:path";
|
|
4750
4930
|
function buildPromptDocument(request, inputFiles) {
|
|
@@ -5212,11 +5392,13 @@ function extractToolCalls(content) {
|
|
|
5212
5392
|
}
|
|
5213
5393
|
const p = part;
|
|
5214
5394
|
if (p.type === "tool_use" && typeof p.name === "string") {
|
|
5215
|
-
toolCalls.push(
|
|
5216
|
-
|
|
5217
|
-
|
|
5218
|
-
|
|
5219
|
-
|
|
5395
|
+
toolCalls.push(
|
|
5396
|
+
normalizeToolCall("claude-cli", {
|
|
5397
|
+
tool: p.name,
|
|
5398
|
+
input: p.input,
|
|
5399
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
5400
|
+
})
|
|
5401
|
+
);
|
|
5220
5402
|
}
|
|
5221
5403
|
}
|
|
5222
5404
|
return toolCalls;
|
|
@@ -5507,11 +5689,13 @@ function extractToolCalls2(content) {
|
|
|
5507
5689
|
}
|
|
5508
5690
|
const p = part;
|
|
5509
5691
|
if (p.type === "tool_use" && typeof p.name === "string") {
|
|
5510
|
-
toolCalls.push(
|
|
5511
|
-
|
|
5512
|
-
|
|
5513
|
-
|
|
5514
|
-
|
|
5692
|
+
toolCalls.push(
|
|
5693
|
+
normalizeToolCall("claude-sdk", {
|
|
5694
|
+
tool: p.name,
|
|
5695
|
+
input: p.input,
|
|
5696
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
5697
|
+
})
|
|
5698
|
+
);
|
|
5515
5699
|
}
|
|
5516
5700
|
}
|
|
5517
5701
|
return toolCalls;
|
|
@@ -6426,27 +6610,33 @@ ${basePrompt}` : basePrompt;
|
|
|
6426
6610
|
}
|
|
6427
6611
|
}
|
|
6428
6612
|
if (itemType === "command_execution") {
|
|
6429
|
-
completedToolCalls.push(
|
|
6430
|
-
|
|
6431
|
-
|
|
6432
|
-
|
|
6433
|
-
|
|
6434
|
-
|
|
6613
|
+
completedToolCalls.push(
|
|
6614
|
+
normalizeToolCall("codex", {
|
|
6615
|
+
tool: "command_execution",
|
|
6616
|
+
input: { command: item.command },
|
|
6617
|
+
output: item.aggregated_output,
|
|
6618
|
+
id: item.id
|
|
6619
|
+
})
|
|
6620
|
+
);
|
|
6435
6621
|
}
|
|
6436
6622
|
if (itemType === "file_change") {
|
|
6437
|
-
completedToolCalls.push(
|
|
6438
|
-
|
|
6439
|
-
|
|
6440
|
-
|
|
6441
|
-
|
|
6623
|
+
completedToolCalls.push(
|
|
6624
|
+
normalizeToolCall("codex", {
|
|
6625
|
+
tool: "file_change",
|
|
6626
|
+
input: item.changes,
|
|
6627
|
+
id: item.id
|
|
6628
|
+
})
|
|
6629
|
+
);
|
|
6442
6630
|
}
|
|
6443
6631
|
if (itemType === "mcp_tool_call") {
|
|
6444
|
-
completedToolCalls.push(
|
|
6445
|
-
|
|
6446
|
-
|
|
6447
|
-
|
|
6448
|
-
|
|
6449
|
-
|
|
6632
|
+
completedToolCalls.push(
|
|
6633
|
+
normalizeToolCall("codex", {
|
|
6634
|
+
tool: `mcp:${item.server}/${item.tool}`,
|
|
6635
|
+
input: item.arguments,
|
|
6636
|
+
output: item.result ?? item.error,
|
|
6637
|
+
id: item.id
|
|
6638
|
+
})
|
|
6639
|
+
);
|
|
6450
6640
|
}
|
|
6451
6641
|
}
|
|
6452
6642
|
resolveCwd(cwdOverride) {
|
|
@@ -6981,12 +7171,6 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
|
|
|
6981
7171
|
return logger;
|
|
6982
7172
|
}
|
|
6983
7173
|
handleEvent(eventType, data) {
|
|
6984
|
-
if (this.format === "json") {
|
|
6985
|
-
const elapsed2 = formatElapsed4(this.startedAt);
|
|
6986
|
-
this.stream.write(`${JSON.stringify({ time: elapsed2, event: eventType, data })}
|
|
6987
|
-
`);
|
|
6988
|
-
return;
|
|
6989
|
-
}
|
|
6990
7174
|
if (this.chunkExtractor) {
|
|
6991
7175
|
const chunkText = this.chunkExtractor(eventType, data);
|
|
6992
7176
|
if (chunkText === null) {
|
|
@@ -6999,6 +7183,12 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
|
|
|
6999
7183
|
}
|
|
7000
7184
|
this.flushPendingText();
|
|
7001
7185
|
}
|
|
7186
|
+
if (this.format === "json") {
|
|
7187
|
+
const elapsed2 = formatElapsed4(this.startedAt);
|
|
7188
|
+
this.stream.write(`${JSON.stringify({ time: elapsed2, event: eventType, data })}
|
|
7189
|
+
`);
|
|
7190
|
+
return;
|
|
7191
|
+
}
|
|
7002
7192
|
const elapsed = formatElapsed4(this.startedAt);
|
|
7003
7193
|
const summary = this.summarize(eventType, data);
|
|
7004
7194
|
if (summary) {
|
|
@@ -7009,14 +7199,19 @@ var CopilotStreamLogger = class _CopilotStreamLogger {
|
|
|
7009
7199
|
flushPendingText() {
|
|
7010
7200
|
if (!this.pendingText) return;
|
|
7011
7201
|
const elapsed = formatElapsed4(this.startedAt);
|
|
7012
|
-
this.stream.write(`[+${elapsed}] [assistant_message] ${this.pendingText}
|
|
7202
|
+
if (this.format === "json") {
|
|
7203
|
+
this.stream.write(
|
|
7204
|
+
`${JSON.stringify({ time: elapsed, event: "assistant_message", data: { content: this.pendingText } })}
|
|
7205
|
+
`
|
|
7206
|
+
);
|
|
7207
|
+
} else {
|
|
7208
|
+
this.stream.write(`[+${elapsed}] [assistant_message] ${this.pendingText}
|
|
7013
7209
|
`);
|
|
7210
|
+
}
|
|
7014
7211
|
this.pendingText = "";
|
|
7015
7212
|
}
|
|
7016
7213
|
async close() {
|
|
7017
|
-
|
|
7018
|
-
this.flushPendingText();
|
|
7019
|
-
}
|
|
7214
|
+
this.flushPendingText();
|
|
7020
7215
|
await new Promise((resolve, reject) => {
|
|
7021
7216
|
this.stream.once("error", reject);
|
|
7022
7217
|
this.stream.end(() => resolve());
|
|
@@ -7091,15 +7286,17 @@ var CopilotCliProvider = class {
|
|
|
7091
7286
|
}
|
|
7092
7287
|
if (update.status === "completed" || update.status === "failed") {
|
|
7093
7288
|
const toolName = update.title ?? update.kind ?? "unknown";
|
|
7094
|
-
completedToolCalls.push(
|
|
7095
|
-
|
|
7096
|
-
|
|
7097
|
-
|
|
7098
|
-
|
|
7099
|
-
|
|
7100
|
-
|
|
7101
|
-
|
|
7102
|
-
|
|
7289
|
+
completedToolCalls.push(
|
|
7290
|
+
normalizeToolCall("copilot-cli", {
|
|
7291
|
+
tool: toolName,
|
|
7292
|
+
input: update.rawInput,
|
|
7293
|
+
output: update.rawOutput,
|
|
7294
|
+
id: callId,
|
|
7295
|
+
startTime: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7296
|
+
endTime: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7297
|
+
durationMs: 0
|
|
7298
|
+
})
|
|
7299
|
+
);
|
|
7103
7300
|
request.streamCallbacks?.onToolCallEnd?.(
|
|
7104
7301
|
toolName,
|
|
7105
7302
|
update.rawInput,
|
|
@@ -7116,15 +7313,17 @@ var CopilotCliProvider = class {
|
|
|
7116
7313
|
if (inProgress) {
|
|
7117
7314
|
toolCallsInProgress.delete(callId);
|
|
7118
7315
|
const duration = Date.now() - inProgress.startMs;
|
|
7119
|
-
completedToolCalls.push(
|
|
7120
|
-
|
|
7121
|
-
|
|
7122
|
-
|
|
7123
|
-
|
|
7124
|
-
|
|
7125
|
-
|
|
7126
|
-
|
|
7127
|
-
|
|
7316
|
+
completedToolCalls.push(
|
|
7317
|
+
normalizeToolCall("copilot-cli", {
|
|
7318
|
+
tool: inProgress.tool,
|
|
7319
|
+
input: inProgress.input,
|
|
7320
|
+
output: update.rawOutput,
|
|
7321
|
+
id: inProgress.id,
|
|
7322
|
+
startTime: inProgress.startTime,
|
|
7323
|
+
endTime: (/* @__PURE__ */ new Date()).toISOString(),
|
|
7324
|
+
durationMs: duration
|
|
7325
|
+
})
|
|
7326
|
+
);
|
|
7128
7327
|
request.streamCallbacks?.onToolCallEnd?.(
|
|
7129
7328
|
inProgress.tool,
|
|
7130
7329
|
inProgress.input,
|
|
@@ -7468,11 +7667,13 @@ function parseCopilotEvents(eventsJsonl) {
|
|
|
7468
7667
|
}
|
|
7469
7668
|
case "assistant.message": {
|
|
7470
7669
|
const toolRequests = data.toolRequests;
|
|
7471
|
-
const toolCalls = (toolRequests ?? []).map(
|
|
7472
|
-
|
|
7473
|
-
|
|
7474
|
-
|
|
7475
|
-
|
|
7670
|
+
const toolCalls = (toolRequests ?? []).map(
|
|
7671
|
+
(req) => normalizeToolCall("copilot-log", {
|
|
7672
|
+
tool: String(req.name ?? req.toolName ?? ""),
|
|
7673
|
+
input: req.arguments,
|
|
7674
|
+
id: req.toolCallId ? String(req.toolCallId) : void 0
|
|
7675
|
+
})
|
|
7676
|
+
);
|
|
7476
7677
|
messages.push({
|
|
7477
7678
|
role: "assistant",
|
|
7478
7679
|
content: data.content != null ? String(data.content) : void 0,
|
|
@@ -7512,12 +7713,12 @@ function parseCopilotEvents(eventsJsonl) {
|
|
|
7512
7713
|
messages.push({
|
|
7513
7714
|
role: "assistant",
|
|
7514
7715
|
toolCalls: [
|
|
7515
|
-
{
|
|
7716
|
+
normalizeToolCall("copilot-log", {
|
|
7516
7717
|
tool: started.toolName,
|
|
7517
7718
|
input: started.input,
|
|
7518
7719
|
output: data.result,
|
|
7519
7720
|
id: toolCallId
|
|
7520
|
-
}
|
|
7721
|
+
})
|
|
7521
7722
|
]
|
|
7522
7723
|
});
|
|
7523
7724
|
}
|
|
@@ -7863,15 +8064,17 @@ var CopilotSdkProvider = class {
|
|
|
7863
8064
|
if (inProgress) {
|
|
7864
8065
|
toolCallsInProgress.delete(callId);
|
|
7865
8066
|
const endMs = Date.now();
|
|
7866
|
-
completedToolCalls.push(
|
|
7867
|
-
|
|
7868
|
-
|
|
7869
|
-
|
|
7870
|
-
|
|
7871
|
-
|
|
7872
|
-
|
|
7873
|
-
|
|
7874
|
-
|
|
8067
|
+
completedToolCalls.push(
|
|
8068
|
+
normalizeToolCall("copilot-sdk", {
|
|
8069
|
+
tool: inProgress.tool,
|
|
8070
|
+
input: inProgress.input,
|
|
8071
|
+
output: data?.output ?? data?.result,
|
|
8072
|
+
id: inProgress.id,
|
|
8073
|
+
startTime: inProgress.startTime,
|
|
8074
|
+
endTime: (/* @__PURE__ */ new Date()).toISOString(),
|
|
8075
|
+
durationMs: endMs - inProgress.startMs
|
|
8076
|
+
})
|
|
8077
|
+
);
|
|
7875
8078
|
}
|
|
7876
8079
|
}
|
|
7877
8080
|
if (eventType === "assistant.message") {
|
|
@@ -8850,12 +9053,14 @@ function extractToolCallsFromEvents(events) {
|
|
|
8850
9053
|
}
|
|
8851
9054
|
const toolCalls = [];
|
|
8852
9055
|
for (const [id, { tool: tool2, input }] of starts) {
|
|
8853
|
-
toolCalls.push(
|
|
8854
|
-
|
|
8855
|
-
|
|
8856
|
-
|
|
8857
|
-
|
|
8858
|
-
|
|
9056
|
+
toolCalls.push(
|
|
9057
|
+
normalizeToolCall("pi-cli", {
|
|
9058
|
+
tool: tool2,
|
|
9059
|
+
input,
|
|
9060
|
+
id: id.startsWith("anon-") ? void 0 : id,
|
|
9061
|
+
output: results.get(id)
|
|
9062
|
+
})
|
|
9063
|
+
);
|
|
8859
9064
|
}
|
|
8860
9065
|
return toolCalls;
|
|
8861
9066
|
}
|
|
@@ -8977,17 +9182,21 @@ function extractToolCalls3(content) {
|
|
|
8977
9182
|
if (!part || typeof part !== "object") continue;
|
|
8978
9183
|
const p = part;
|
|
8979
9184
|
if (p.type === "tool_use" && typeof p.name === "string") {
|
|
8980
|
-
toolCalls.push(
|
|
8981
|
-
|
|
8982
|
-
|
|
8983
|
-
|
|
8984
|
-
|
|
9185
|
+
toolCalls.push(
|
|
9186
|
+
normalizeToolCall("pi-cli", {
|
|
9187
|
+
tool: p.name,
|
|
9188
|
+
input: p.input,
|
|
9189
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
9190
|
+
})
|
|
9191
|
+
);
|
|
8985
9192
|
} else if ((p.type === "toolCall" || p.type === "tool_call") && typeof p.name === "string") {
|
|
8986
|
-
toolCalls.push(
|
|
8987
|
-
|
|
8988
|
-
|
|
8989
|
-
|
|
8990
|
-
|
|
9193
|
+
toolCalls.push(
|
|
9194
|
+
normalizeToolCall("pi-cli", {
|
|
9195
|
+
tool: p.name,
|
|
9196
|
+
input: p.arguments ?? p.input,
|
|
9197
|
+
id: typeof p.id === "string" ? p.id : void 0
|
|
9198
|
+
})
|
|
9199
|
+
);
|
|
8991
9200
|
} else if (p.type === "tool_result" && typeof p.tool_use_id === "string") {
|
|
8992
9201
|
const existing = toolCalls.find((tc) => tc.id === p.tool_use_id);
|
|
8993
9202
|
if (existing) {
|
|
@@ -14066,100 +14275,35 @@ var LatencyEvaluator = class {
|
|
|
14066
14275
|
};
|
|
14067
14276
|
|
|
14068
14277
|
// src/evaluation/evaluators/skill-trigger.ts
|
|
14069
|
-
var CLAUDE_MATCHER = {
|
|
14070
|
-
skillTools: ["Skill"],
|
|
14071
|
-
skillInputField: "skill",
|
|
14072
|
-
readTools: ["Read"],
|
|
14073
|
-
readInputField: "file_path"
|
|
14074
|
-
};
|
|
14075
|
-
var COPILOT_MATCHER = {
|
|
14076
|
-
skillTools: ["Skill", "skill"],
|
|
14077
|
-
skillInputField: "skill",
|
|
14078
|
-
readTools: ["Read File", "readFile", "Read", "readTextFile"],
|
|
14079
|
-
readInputField: "file_path",
|
|
14080
|
-
skillToolPrefixes: ["Using skill: "],
|
|
14081
|
-
readToolPrefixes: ["Viewing "],
|
|
14082
|
-
readInputFields: ["file_path", "path"]
|
|
14083
|
-
};
|
|
14084
|
-
var PI_CODING_AGENT_MATCHER = {
|
|
14085
|
-
skillTools: [],
|
|
14086
|
-
skillInputField: "skill",
|
|
14087
|
-
readTools: ["read"],
|
|
14088
|
-
readInputField: "path",
|
|
14089
|
-
readInputFields: ["path", "file_path", "filePath"]
|
|
14090
|
-
};
|
|
14091
|
-
var CODEX_MATCHER = {
|
|
14092
|
-
skillTools: [],
|
|
14093
|
-
skillInputField: "skill",
|
|
14094
|
-
readTools: ["command_execution"],
|
|
14095
|
-
readInputField: "command",
|
|
14096
|
-
skillToolPrefixes: ["mcp:"],
|
|
14097
|
-
readToolPrefixes: ["mcp:"],
|
|
14098
|
-
readInputFields: ["command", "path", "file_path", "filePath"]
|
|
14099
|
-
};
|
|
14100
|
-
var PROVIDER_TOOL_SEMANTICS = {
|
|
14101
|
-
claude: CLAUDE_MATCHER,
|
|
14102
|
-
"claude-cli": CLAUDE_MATCHER,
|
|
14103
|
-
"claude-sdk": CLAUDE_MATCHER,
|
|
14104
|
-
codex: CODEX_MATCHER,
|
|
14105
|
-
"pi-coding-agent": PI_CODING_AGENT_MATCHER,
|
|
14106
|
-
"pi-cli": PI_CODING_AGENT_MATCHER,
|
|
14107
|
-
"copilot-cli": COPILOT_MATCHER,
|
|
14108
|
-
"copilot-log": COPILOT_MATCHER,
|
|
14109
|
-
"copilot-sdk": COPILOT_MATCHER,
|
|
14110
|
-
vscode: COPILOT_MATCHER,
|
|
14111
|
-
"vscode-insiders": COPILOT_MATCHER
|
|
14112
|
-
};
|
|
14113
14278
|
var SkillTriggerEvaluator = class {
|
|
14114
14279
|
kind = "skill-trigger";
|
|
14115
14280
|
config;
|
|
14116
14281
|
constructor(config) {
|
|
14117
14282
|
this.config = config;
|
|
14118
14283
|
}
|
|
14119
|
-
resolveMatcher(providerKind) {
|
|
14120
|
-
if (providerKind) {
|
|
14121
|
-
const match = PROVIDER_TOOL_SEMANTICS[providerKind];
|
|
14122
|
-
if (match) return match;
|
|
14123
|
-
}
|
|
14124
|
-
return CLAUDE_MATCHER;
|
|
14125
|
-
}
|
|
14126
14284
|
evaluate(context) {
|
|
14127
14285
|
const skillName = this.config.skill;
|
|
14128
14286
|
const shouldTrigger = this.config.should_trigger !== false;
|
|
14129
|
-
const providerKind = context.provider?.kind;
|
|
14130
|
-
const matcher = this.resolveMatcher(providerKind);
|
|
14131
14287
|
const allToolCalls = (context.output ?? []).flatMap((msg) => msg.toolCalls ?? []);
|
|
14132
14288
|
let triggered = false;
|
|
14133
14289
|
let evidence = "";
|
|
14134
14290
|
for (const toolCall of allToolCalls) {
|
|
14135
14291
|
const toolName = toolCall.tool ?? "";
|
|
14136
14292
|
const input = toolCall.input ?? {};
|
|
14137
|
-
if (matcher.skillTools.includes(toolName)) {
|
|
14138
|
-
const skillArg = String(input[matcher.skillInputField] ?? "");
|
|
14293
|
+
if (toolName === "Skill") {
|
|
14294
|
+
const skillArg = String(input.skill ?? "");
|
|
14139
14295
|
if (skillArg.includes(skillName)) {
|
|
14140
14296
|
triggered = true;
|
|
14141
|
-
evidence = `Skill tool invoked with
|
|
14297
|
+
evidence = `Skill tool invoked with skill="${skillArg}"`;
|
|
14142
14298
|
break;
|
|
14143
14299
|
}
|
|
14144
|
-
} else if (
|
|
14145
|
-
|
|
14146
|
-
|
|
14147
|
-
triggered = true;
|
|
14148
|
-
evidence = `Skill tool invoked via tool name "${toolName}"`;
|
|
14149
|
-
break;
|
|
14150
|
-
} else if (matcher.readTools.includes(toolName)) {
|
|
14151
|
-
const filePath = this.readPathFromInput(input, matcher);
|
|
14152
|
-
if (filePath.includes(skillName)) {
|
|
14300
|
+
} else if (toolName === "Read") {
|
|
14301
|
+
const filePath = String(input.file_path ?? "");
|
|
14302
|
+
if (filePath.includes(`skills/${skillName}/`)) {
|
|
14153
14303
|
triggered = true;
|
|
14154
14304
|
evidence = `Read tool loaded skill file: ${filePath}`;
|
|
14155
14305
|
break;
|
|
14156
14306
|
}
|
|
14157
|
-
} else if (matcher.readToolPrefixes?.some(
|
|
14158
|
-
(prefix) => toolName.startsWith(prefix) && toolName.includes(skillName)
|
|
14159
|
-
)) {
|
|
14160
|
-
triggered = true;
|
|
14161
|
-
evidence = `Read tool loaded skill file via tool name "${toolName}"`;
|
|
14162
|
-
break;
|
|
14163
14307
|
}
|
|
14164
14308
|
if (!triggered && toolCall.output != null) {
|
|
14165
14309
|
const outputStr = typeof toolCall.output === "string" ? toolCall.output : JSON.stringify(toolCall.output);
|
|
@@ -14196,16 +14340,6 @@ var SkillTriggerEvaluator = class {
|
|
|
14196
14340
|
expectedAspectCount: 1
|
|
14197
14341
|
};
|
|
14198
14342
|
}
|
|
14199
|
-
readPathFromInput(input, matcher) {
|
|
14200
|
-
const fields = matcher.readInputFields ?? [matcher.readInputField];
|
|
14201
|
-
for (const field of fields) {
|
|
14202
|
-
const value = input[field];
|
|
14203
|
-
if (value !== void 0 && value !== null) {
|
|
14204
|
-
return String(value);
|
|
14205
|
-
}
|
|
14206
|
-
}
|
|
14207
|
-
return "";
|
|
14208
|
-
}
|
|
14209
14343
|
};
|
|
14210
14344
|
|
|
14211
14345
|
// src/evaluation/evaluators/llm-grader-prompt.ts
|
|
@@ -15050,10 +15184,12 @@ function runEqualsAssertion(output, value) {
|
|
|
15050
15184
|
}
|
|
15051
15185
|
|
|
15052
15186
|
// src/evaluation/orchestrator.ts
|
|
15187
|
+
import { execFile as execFile3 } from "node:child_process";
|
|
15053
15188
|
import { createHash as createHash2, randomUUID as randomUUID9 } from "node:crypto";
|
|
15054
15189
|
import { existsSync as existsSync5 } from "node:fs";
|
|
15055
15190
|
import { copyFile as copyFile2, mkdir as mkdir14, readdir as readdir8, stat as stat9 } from "node:fs/promises";
|
|
15056
15191
|
import path45 from "node:path";
|
|
15192
|
+
import { promisify as promisify7 } from "node:util";
|
|
15057
15193
|
import micromatch3 from "micromatch";
|
|
15058
15194
|
|
|
15059
15195
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
@@ -16507,6 +16643,8 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
|
|
|
16507
16643
|
}
|
|
16508
16644
|
|
|
16509
16645
|
// src/evaluation/orchestrator.ts
|
|
16646
|
+
var execFileAsync3 = promisify7(execFile3);
|
|
16647
|
+
var WORKSPACE_GIT_TIMEOUT_MS = 3e5;
|
|
16510
16648
|
function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
|
|
16511
16649
|
return score >= threshold ? "ok" : "quality_failure";
|
|
16512
16650
|
}
|
|
@@ -16544,6 +16682,35 @@ function hasHookCommand(hook) {
|
|
|
16544
16682
|
function hooksEnabled(workspace) {
|
|
16545
16683
|
return workspace?.hooks?.enabled !== false;
|
|
16546
16684
|
}
|
|
16685
|
+
function workspaceGitEnv() {
|
|
16686
|
+
const env = { ...process.env };
|
|
16687
|
+
for (const key of Object.keys(env)) {
|
|
16688
|
+
if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
|
|
16689
|
+
delete env[key];
|
|
16690
|
+
}
|
|
16691
|
+
}
|
|
16692
|
+
return {
|
|
16693
|
+
...env,
|
|
16694
|
+
GIT_TERMINAL_PROMPT: "0",
|
|
16695
|
+
GIT_ASKPASS: "",
|
|
16696
|
+
GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
|
|
16697
|
+
};
|
|
16698
|
+
}
|
|
16699
|
+
async function resetWorkspaceRoot(workspacePath, resetMode, baselineRef) {
|
|
16700
|
+
if (!existsSync5(path45.join(workspacePath, ".git"))) {
|
|
16701
|
+
return false;
|
|
16702
|
+
}
|
|
16703
|
+
const cleanFlag = resetMode === "strict" ? "-fdx" : "-fd";
|
|
16704
|
+
const opts = {
|
|
16705
|
+
cwd: workspacePath,
|
|
16706
|
+
timeout: WORKSPACE_GIT_TIMEOUT_MS,
|
|
16707
|
+
env: workspaceGitEnv(),
|
|
16708
|
+
maxBuffer: 50 * 1024 * 1024
|
|
16709
|
+
};
|
|
16710
|
+
await execFileAsync3("git", ["reset", "--hard", baselineRef ?? "HEAD"], opts);
|
|
16711
|
+
await execFileAsync3("git", ["clean", cleanFlag], opts);
|
|
16712
|
+
return true;
|
|
16713
|
+
}
|
|
16547
16714
|
function getWorkspaceTemplate(target) {
|
|
16548
16715
|
const config = target.config;
|
|
16549
16716
|
if ("workspaceTemplate" in config && typeof config.workspaceTemplate === "string") {
|
|
@@ -17805,6 +17972,37 @@ async function runEvalCase(options) {
|
|
|
17805
17972
|
}
|
|
17806
17973
|
}
|
|
17807
17974
|
}
|
|
17975
|
+
let beforeEachNeedsFreshBaseline = false;
|
|
17976
|
+
if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.before_each?.reset && evalCase.workspace.hooks.before_each.reset !== "none") {
|
|
17977
|
+
try {
|
|
17978
|
+
if (repoManager && evalCase.workspace.repos?.length) {
|
|
17979
|
+
await repoManager.reset(
|
|
17980
|
+
evalCase.workspace.repos,
|
|
17981
|
+
workspacePath,
|
|
17982
|
+
evalCase.workspace.hooks.before_each.reset
|
|
17983
|
+
);
|
|
17984
|
+
} else {
|
|
17985
|
+
await resetWorkspaceRoot(
|
|
17986
|
+
workspacePath,
|
|
17987
|
+
evalCase.workspace.hooks.before_each.reset,
|
|
17988
|
+
sharedBaselineCommit
|
|
17989
|
+
);
|
|
17990
|
+
}
|
|
17991
|
+
} catch (error) {
|
|
17992
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
17993
|
+
return buildErrorResult(
|
|
17994
|
+
evalCase,
|
|
17995
|
+
target.name,
|
|
17996
|
+
nowFn(),
|
|
17997
|
+
new Error(`before_each reset failed: ${message}`),
|
|
17998
|
+
promptInputs,
|
|
17999
|
+
provider,
|
|
18000
|
+
"setup",
|
|
18001
|
+
"script_error",
|
|
18002
|
+
verbose
|
|
18003
|
+
);
|
|
18004
|
+
}
|
|
18005
|
+
}
|
|
17808
18006
|
const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each;
|
|
17809
18007
|
if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeEachHook)) {
|
|
17810
18008
|
const beforeEachHook = caseBeforeEachHook;
|
|
@@ -17821,6 +18019,7 @@ async function runEvalCase(options) {
|
|
|
17821
18019
|
toScriptConfig(beforeEachHook, "before_each", `test '${evalCase.id}'`),
|
|
17822
18020
|
scriptContext
|
|
17823
18021
|
);
|
|
18022
|
+
beforeEachNeedsFreshBaseline = true;
|
|
17824
18023
|
} catch (error) {
|
|
17825
18024
|
const message = error instanceof Error ? error.message : String(error);
|
|
17826
18025
|
return buildErrorResult(
|
|
@@ -17836,7 +18035,7 @@ async function runEvalCase(options) {
|
|
|
17836
18035
|
);
|
|
17837
18036
|
}
|
|
17838
18037
|
}
|
|
17839
|
-
let baselineCommit = sharedBaselineCommit;
|
|
18038
|
+
let baselineCommit = beforeEachNeedsFreshBaseline ? void 0 : sharedBaselineCommit;
|
|
17840
18039
|
if (!baselineCommit && workspacePath) {
|
|
17841
18040
|
try {
|
|
17842
18041
|
baselineCommit = await initializeBaseline(workspacePath);
|
|
@@ -17847,6 +18046,35 @@ async function runEvalCase(options) {
|
|
|
17847
18046
|
}
|
|
17848
18047
|
}
|
|
17849
18048
|
}
|
|
18049
|
+
if (evalCase.mode === "conversation" && evalCase.turns?.length) {
|
|
18050
|
+
const conversationResult = await runConversationMode({
|
|
18051
|
+
evalCase,
|
|
18052
|
+
provider,
|
|
18053
|
+
target,
|
|
18054
|
+
evaluators,
|
|
18055
|
+
typeRegistry,
|
|
18056
|
+
graderProvider,
|
|
18057
|
+
promptInputs,
|
|
18058
|
+
nowFn,
|
|
18059
|
+
signal,
|
|
18060
|
+
workspacePath,
|
|
18061
|
+
caseWorkspaceFile: caseWorkspaceFile ?? suiteWorkspaceFile,
|
|
18062
|
+
agentTimeoutMs,
|
|
18063
|
+
streamCallbacks: options.streamCallbacks,
|
|
18064
|
+
verbose,
|
|
18065
|
+
threshold: evalCase.threshold ?? caseThreshold,
|
|
18066
|
+
targetResolver,
|
|
18067
|
+
availableTargets
|
|
18068
|
+
});
|
|
18069
|
+
if (workspacePath && !isSharedWorkspace) {
|
|
18070
|
+
const shouldRetain = conversationResult.executionStatus === "ok" ? retainOnSuccess === "keep" || keepWorkspaces : retainOnFailure === "keep" || !forceCleanup && !keepWorkspaces;
|
|
18071
|
+
if (!shouldRetain) {
|
|
18072
|
+
await cleanupWorkspace(workspacePath).catch(() => {
|
|
18073
|
+
});
|
|
18074
|
+
}
|
|
18075
|
+
}
|
|
18076
|
+
return conversationResult;
|
|
18077
|
+
}
|
|
17850
18078
|
const caseStartMs = Date.now();
|
|
17851
18079
|
const attemptBudget = (maxRetries ?? 0) + 1;
|
|
17852
18080
|
let attempt = 0;
|
|
@@ -17961,13 +18189,21 @@ async function runEvalCase(options) {
|
|
|
17961
18189
|
${providerFileChanges}` : providerFileChanges;
|
|
17962
18190
|
}
|
|
17963
18191
|
const providerError = extractProviderError(providerResponse);
|
|
17964
|
-
if (caseHooksEnabled &&
|
|
18192
|
+
if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none") {
|
|
17965
18193
|
try {
|
|
17966
|
-
|
|
17967
|
-
|
|
17968
|
-
|
|
17969
|
-
|
|
17970
|
-
|
|
18194
|
+
if (repoManager && evalCase.workspace.repos?.length) {
|
|
18195
|
+
await repoManager.reset(
|
|
18196
|
+
evalCase.workspace.repos,
|
|
18197
|
+
workspacePath,
|
|
18198
|
+
evalCase.workspace.hooks.after_each.reset
|
|
18199
|
+
);
|
|
18200
|
+
} else {
|
|
18201
|
+
await resetWorkspaceRoot(
|
|
18202
|
+
workspacePath,
|
|
18203
|
+
evalCase.workspace.hooks.after_each.reset,
|
|
18204
|
+
baselineCommit
|
|
18205
|
+
);
|
|
18206
|
+
}
|
|
17971
18207
|
} catch {
|
|
17972
18208
|
}
|
|
17973
18209
|
}
|
|
@@ -18583,6 +18819,276 @@ function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
|
18583
18819
|
"llm-grader": llmGrader
|
|
18584
18820
|
};
|
|
18585
18821
|
}
|
|
18822
|
+
/**
 * Runs an eval case in conversation mode: each entry of `evalCase.turns` is
 * sent to the provider in order, the reply is appended to a running history,
 * and the turn is graded. Optional case-level assertions are then graded
 * against the full transcript, and all scores are aggregated into one result.
 *
 * Per-turn outcomes:
 *  - after a stop was triggered: score 0, verdict "skip";
 *  - provider.invoke throws: score 0, verdict "fail" (the user message stays in history);
 *  - turn has neither assertions nor expected_output: score 1, verdict "pass";
 *  - otherwise: the score returned by evaluateCandidate.
 *
 * `evalCase.on_turn_failure === "stop"` halts further turns after a provider
 * error or a turn score below the threshold. `evalCase.window_size`, when
 * truthy, limits the history sent to the provider and to the turn grader.
 *
 * @param {object} options - Run context (evalCase, provider, target, evaluators,
 *   typeRegistry, graderProvider, nowFn, signal, workspacePath, caseWorkspaceFile,
 *   agentTimeoutMs, streamCallbacks, verbose, threshold, targetResolver, availableTargets).
 * @returns {Promise<object>} Result record with the aggregated score, flattened
 *   assertions, per-turn/conversation scores, full history as output, and duration.
 */
async function runConversationMode(options) {
  const {
    evalCase,
    provider,
    target,
    evaluators,
    typeRegistry,
    graderProvider,
    nowFn,
    signal,
    workspacePath,
    caseWorkspaceFile,
    agentTimeoutMs,
    streamCallbacks,
    verbose,
    threshold,
    targetResolver,
    availableTargets
  } = options;
  const aggregation = evalCase.aggregation ?? "mean";
  const stopOnFailure = (evalCase.on_turn_failure ?? "continue") === "stop";
  const windowSize = evalCase.window_size;
  const passMark = threshold ?? DEFAULT_THRESHOLD;

  const asText = (value) => typeof value === "string" ? value : JSON.stringify(value);
  const toMessages = (entries) => entries.map(({ role, content }) => ({ role, content }));

  // Seed the history with the case's initial input messages (content flattened to text).
  const history = [];
  for (const { role, content } of evalCase.input) {
    history.push({ role, content: asText(content) });
  }
  const currentPrompt = () => windowSize ? buildWindowedHistory(history, windowSize) : [...history];

  // Arguments shared by every evaluateCandidate call below.
  const graderArgs = {
    target,
    provider,
    evaluators,
    typeRegistry,
    nowFn,
    attempt: 0,
    graderProvider,
    agentTimeoutMs,
    verbose,
    threshold,
    targetResolver,
    availableTargets
  };

  // One entry per turn, plus an optional trailing "conversation" entry.
  const resultScores = [];
  let halted = false;
  const caseStartMs = Date.now();

  for (const [position, turn] of evalCase.turns.entries()) {
    const turnLabel = `turn-${position + 1}`;
    if (halted) {
      resultScores.push({
        name: turnLabel,
        type: "rubrics",
        score: 0,
        verdict: "skip",
        assertions: [{ text: "Skipped due to previous turn failure", passed: false }]
      });
      continue;
    }

    const userContent = asText(turn.input);
    history.push({ role: "user", content: userContent });
    const chatPromptForProvider = currentPrompt();

    let response;
    try {
      response = await provider.invoke({
        question: userContent,
        chatPrompt: chatPromptForProvider,
        evalCaseId: `${evalCase.id}/${turnLabel}`,
        signal,
        cwd: workspacePath,
        workspaceFile: caseWorkspaceFile,
        streamCallbacks
      });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      resultScores.push({
        name: turnLabel,
        type: "rubrics",
        score: 0,
        verdict: "fail",
        assertions: [{ text: `Provider error: ${message}`, passed: false }]
      });
      if (stopOnFailure) {
        halted = true;
      }
      continue;
    }

    const assistantContent = extractLastAssistantContent(response.output);
    history.push({ role: "assistant", content: assistantContent });

    const expected = turn.expected_output;
    const hasChecks = Boolean(turn.assertions?.length) || Boolean(expected);
    if (!hasChecks) {
      // Nothing to grade for this turn: it passes by default.
      resultScores.push({
        name: turnLabel,
        type: "rubrics",
        score: 1,
        verdict: "pass",
        assertions: []
      });
      continue;
    }

    let expectedMessages = [];
    if (expected) {
      expectedMessages = [typeof expected === "string" ? { content: expected } : expected];
    }
    const graded = await evaluateCandidate({
      ...graderArgs,
      evalCase: {
        ...evalCase,
        id: `${evalCase.id}/${turnLabel}`,
        assertions: buildTurnAssertions(turn),
        input: buildTurnGraderInput(history, windowSize),
        expected_output: expectedMessages,
        // Clear conversation fields to prevent recursion
        mode: void 0,
        turns: void 0
      },
      candidate: assistantContent,
      promptInputs: {
        question: buildConversationContext(history, windowSize),
        chatPrompt: currentPrompt()
      },
      output: response.output
    });
    const turnScore = graded.score;
    resultScores.push({
      name: turnLabel,
      type: "rubrics",
      score: turnScore,
      verdict: scoreToVerdict(turnScore, passMark),
      assertions: graded.assertions ? [...graded.assertions] : [],
      scores: graded.scores
    });
    if (stopOnFailure && turnScore < passMark) {
      halted = true;
    }
  }

  // Case-level assertions are graded once against the full, unwindowed transcript.
  if (evalCase.assertions?.length) {
    const fullTranscript = history.map(({ role, content }) => `${role}: ${asText(content)}`).join("\n\n");
    const overall = await evaluateCandidate({
      ...graderArgs,
      evalCase: {
        ...evalCase,
        id: `${evalCase.id}/conversation`,
        input: toMessages(history),
        expected_output: [],
        mode: void 0,
        turns: void 0
      },
      candidate: fullTranscript,
      promptInputs: {
        question: fullTranscript,
        chatPrompt: [...history]
      }
    });
    resultScores.push({
      name: "conversation",
      type: "rubrics",
      score: overall.score,
      verdict: scoreToVerdict(overall.score, passMark),
      assertions: overall.assertions ? [...overall.assertions] : [],
      scores: overall.scores
    });
  }

  const finalScore = aggregateConversationScores(
    resultScores.map((entry) => entry.score),
    aggregation
  );
  const outputMessages = toMessages(history);
  const flatAssertions = resultScores.flatMap((entry) => [...entry.assertions]);
  const totalDurationMs = Date.now() - caseStartMs;
  return {
    timestamp: nowFn().toISOString(),
    testId: evalCase.id,
    suite: evalCase.suite,
    category: evalCase.category,
    score: finalScore,
    assertions: flatAssertions,
    target: target.name,
    output: outputMessages,
    scores: resultScores,
    executionStatus: classifyQualityStatus(finalScore, passMark),
    input: evalCase.input.map(({ role, content }) => ({ role, content: asText(content) })),
    evalRun: { durationMs: totalDurationMs }
  };
}
|
|
19035
|
+
/**
 * Trims a message history to a window while keeping every system message.
 *
 * @param {Array<{role: string}>} history - Messages in chronological order.
 * @param {number} windowSize - Window size; the last `windowSize * 2`
 *   non-system messages are kept (a value of 0 keeps all of them).
 * @returns {Array} All system messages (original order) followed by the kept
 *   tail of non-system messages. The input array is not modified.
 */
function buildWindowedHistory(history, windowSize) {
  const pinned = [];
  const conversational = [];
  history.forEach((message) => {
    const bucket = message.role === "system" ? pinned : conversational;
    bucket.push(message);
  });
  const tail = conversational.slice(-windowSize * 2);
  return pinned.concat(tail);
}
|
|
19041
|
+
/**
 * Renders a message history as plain text, one "role: content" entry per
 * message, separated by blank lines.
 *
 * @param {Array<{role: string, content: unknown}>} history - Messages to render.
 * @param {number} [windowSize] - When truthy, the history is first trimmed with
 *   buildWindowedHistory; otherwise the full history is rendered.
 * @returns {string} The rendered transcript; non-string content is JSON-stringified.
 */
function buildConversationContext(history, windowSize) {
  let visible = history;
  if (windowSize) {
    visible = buildWindowedHistory(history, windowSize);
  }
  const renderLine = ({ role, content }) => {
    const text = typeof content === "string" ? content : JSON.stringify(content);
    return `${role}: ${text}`;
  };
  return visible.map(renderLine).join("\n\n");
}
|
|
19048
|
+
/**
 * Produces the `input` message list handed to the grader for a turn.
 *
 * @param {Array<{role: string, content: unknown}>} history - Conversation so far.
 * @param {number} [windowSize] - When truthy, the history is first trimmed with
 *   buildWindowedHistory; otherwise the full history is used.
 * @returns {Array<{role: string, content: unknown}>} New objects carrying only
 *   `role` and `content` from each selected message.
 */
function buildTurnGraderInput(history, windowSize) {
  let selected = history;
  if (windowSize) {
    selected = buildWindowedHistory(history, windowSize);
  }
  return selected.map(({ role, content }) => ({ role, content }));
}
|
|
19055
|
+
/**
 * Converts a turn's `assertions` list into grader configurations.
 *
 * String entries are treated as rubric criteria and folded into a single
 * "llm-grader" entry named "turn-rubrics" (ids `criterion-1`, `criterion-2`, ...,
 * weight 1 each). Non-string entries are passed through unchanged, after the
 * rubric grader.
 *
 * @param {{assertions?: Array<string | object>}} turn - Turn definition.
 * @returns {Array<object>} Grader configurations; empty when the turn has no assertions.
 */
function buildTurnAssertions(turn) {
  const declared = turn.assertions;
  if (!declared?.length) {
    return [];
  }
  const criteria = [];
  const structured = [];
  for (const entry of declared) {
    const bucket = typeof entry === "string" ? criteria : structured;
    bucket.push(entry);
  }
  if (criteria.length === 0) {
    return structured;
  }
  const rubricGrader = {
    name: "turn-rubrics",
    type: "llm-grader",
    rubrics: criteria.map((outcome, position) => ({
      id: `criterion-${position + 1}`,
      outcome,
      weight: 1
    }))
  };
  return [rubricGrader, ...structured];
}
|
|
19081
|
+
/**
 * Combines per-turn (and conversation-level) scores into a single score.
 *
 * @param {number[]} scores - Scores to combine.
 * @param {string} [aggregation] - "min" or "max"; any other value yields the arithmetic mean.
 * @returns {number} The aggregated score, or 1 when `scores` is empty.
 */
function aggregateConversationScores(scores, aggregation) {
  if (scores.length === 0) {
    return 1;
  }
  if (aggregation === "min") {
    return Math.min(...scores);
  }
  if (aggregation === "max") {
    return Math.max(...scores);
  }
  let total = 0;
  for (const value of scores) {
    total += value;
  }
  return total / scores.length;
}
|
|
18586
19092
|
async function invokeProvider(provider, options) {
|
|
18587
19093
|
const {
|
|
18588
19094
|
evalCase,
|
|
@@ -19299,13 +19805,13 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
19299
19805
|
}
|
|
19300
19806
|
|
|
19301
19807
|
// src/evaluation/results-repo.ts
|
|
19302
|
-
import { execFile as
|
|
19808
|
+
import { execFile as execFile4 } from "node:child_process";
|
|
19303
19809
|
import { existsSync as existsSync7, mkdirSync as mkdirSync2, readFileSync as readFileSync3, rmSync, writeFileSync } from "node:fs";
|
|
19304
19810
|
import { cp as cp3, mkdtemp as mkdtemp3, readdir as readdir9, rm as rm6, stat as stat10 } from "node:fs/promises";
|
|
19305
19811
|
import os3 from "node:os";
|
|
19306
19812
|
import path49 from "node:path";
|
|
19307
|
-
import { promisify as
|
|
19308
|
-
var
|
|
19813
|
+
import { promisify as promisify8 } from "node:util";
|
|
19814
|
+
// Promise-returning form of node:child_process execFile (via node:util promisify), awaited by runCommand.
var execFileAsync4 = promisify8(execFile4);
|
|
19309
19815
|
/**
 * Normalizes a repository identifier into a slug.
 * Surrounding whitespace is trimmed, then every run of characters outside
 * `A-Z a-z 0-9 . _ -` is replaced by a single "-".
 *
 * @param {string} repo - Repository identifier.
 * @returns {string} The sanitized slug.
 */
function sanitizeRepoSlug(repo) {
  const trimmed = repo.trim();
  return trimmed.replace(/[^A-Za-z0-9._-]+/g, "-");
}
|
|
@@ -19356,7 +19862,7 @@ function writePersistedStatus(statusFile, status) {
|
|
|
19356
19862
|
}
|
|
19357
19863
|
async function runCommand(executable, args, options) {
|
|
19358
19864
|
try {
|
|
19359
|
-
const { stdout, stderr } = await
|
|
19865
|
+
const { stdout, stderr } = await execFileAsync4(executable, [...args], {
|
|
19360
19866
|
cwd: options?.cwd,
|
|
19361
19867
|
env: process.env
|
|
19362
19868
|
});
|
|
@@ -20404,11 +20910,13 @@ function extractAssistantContent(content) {
|
|
|
20404
20910
|
break;
|
|
20405
20911
|
case "tool_use":
|
|
20406
20912
|
if (block.name) {
|
|
20407
|
-
toolCalls.push(
|
|
20408
|
-
|
|
20409
|
-
|
|
20410
|
-
|
|
20411
|
-
|
|
20913
|
+
toolCalls.push(
|
|
20914
|
+
normalizeToolCall("claude", {
|
|
20915
|
+
tool: block.name,
|
|
20916
|
+
input: block.input,
|
|
20917
|
+
id: block.id
|
|
20918
|
+
})
|
|
20919
|
+
);
|
|
20412
20920
|
}
|
|
20413
20921
|
break;
|
|
20414
20922
|
}
|
|
@@ -20500,7 +21008,11 @@ function parseCodexSession(jsonl) {
|
|
|
20500
21008
|
} else {
|
|
20501
21009
|
input = payload.arguments;
|
|
20502
21010
|
}
|
|
20503
|
-
const toolCall =
|
|
21011
|
+
const toolCall = normalizeToolCall("codex", {
|
|
21012
|
+
tool: toolName,
|
|
21013
|
+
input,
|
|
21014
|
+
id: callId
|
|
21015
|
+
});
|
|
20504
21016
|
const msgIdx = messages.length;
|
|
20505
21017
|
messages.push({
|
|
20506
21018
|
role: "assistant",
|
|
@@ -20524,7 +21036,11 @@ function parseCodexSession(jsonl) {
|
|
|
20524
21036
|
} else {
|
|
20525
21037
|
input = payload.arguments;
|
|
20526
21038
|
}
|
|
20527
|
-
const toolCall =
|
|
21039
|
+
const toolCall = normalizeToolCall("codex", {
|
|
21040
|
+
tool: toolName,
|
|
21041
|
+
input,
|
|
21042
|
+
id: callId
|
|
21043
|
+
});
|
|
20528
21044
|
const msgIdx = messages.length;
|
|
20529
21045
|
messages.push({
|
|
20530
21046
|
role: "assistant",
|