@agentv/core 0.2.8 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-XXNQA4EW.js → chunk-NL7K4CAK.js} +5 -1
- package/dist/chunk-NL7K4CAK.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +186 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +183 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1701 -324
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +121 -63
- package/dist/index.d.ts +121 -63
- package/dist/index.js +1710 -327
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-XXNQA4EW.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -30,25 +30,20 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
30
30
|
// src/index.ts
|
|
31
31
|
var index_exports = {};
|
|
32
32
|
__export(index_exports, {
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
QualityGrader: () => QualityGrader,
|
|
33
|
+
CodeEvaluator: () => CodeEvaluator,
|
|
34
|
+
LlmJudgeEvaluator: () => LlmJudgeEvaluator,
|
|
36
35
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
37
36
|
buildDirectoryChain: () => buildDirectoryChain,
|
|
38
37
|
buildPromptInputs: () => buildPromptInputs,
|
|
39
38
|
buildSearchRoots: () => buildSearchRoots,
|
|
40
|
-
calculateHits: () => calculateHits,
|
|
41
|
-
calculateMisses: () => calculateMisses,
|
|
42
39
|
createAgentKernel: () => createAgentKernel,
|
|
43
40
|
createProvider: () => createProvider,
|
|
44
41
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
45
|
-
extractAspects: () => extractAspects,
|
|
46
42
|
extractCodeBlocks: () => extractCodeBlocks,
|
|
47
43
|
fileExists: () => fileExists,
|
|
48
44
|
findGitRoot: () => findGitRoot,
|
|
49
45
|
getHitCount: () => getHitCount,
|
|
50
|
-
|
|
51
|
-
isGraderKind: () => isGraderKind,
|
|
46
|
+
isEvaluatorKind: () => isEvaluatorKind,
|
|
52
47
|
isGuidelineFile: () => isGuidelineFile,
|
|
53
48
|
isJsonObject: () => isJsonObject,
|
|
54
49
|
isJsonValue: () => isJsonValue,
|
|
@@ -61,8 +56,7 @@ __export(index_exports, {
|
|
|
61
56
|
resolveFileReference: () => resolveFileReference,
|
|
62
57
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
63
58
|
runEvalCase: () => runEvalCase,
|
|
64
|
-
runEvaluation: () => runEvaluation
|
|
65
|
-
scoreCandidateResponse: () => scoreCandidateResponse
|
|
59
|
+
runEvaluation: () => runEvaluation
|
|
66
60
|
});
|
|
67
61
|
module.exports = __toCommonJS(index_exports);
|
|
68
62
|
|
|
@@ -107,11 +101,10 @@ function isTestMessage(value) {
|
|
|
107
101
|
}
|
|
108
102
|
return candidate.content.every(isJsonObject);
|
|
109
103
|
}
|
|
110
|
-
var
|
|
111
|
-
var
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
return typeof value === "string" && GRADER_KIND_SET.has(value);
|
|
104
|
+
// Evaluator kinds accepted in eval files.
var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
// Lookup set built once from the list above.
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
/**
 * Type guard for evaluator kinds.
 *
 * @param {unknown} value - Candidate value read from configuration.
 * @returns {boolean} True only for a string listed in EVALUATOR_KIND_VALUES.
 */
function isEvaluatorKind(value) {
  if (typeof value !== "string") {
    return false;
  }
  return EVALUATOR_KIND_SET.has(value);
}
|
|
116
109
|
function getHitCount(result) {
|
|
117
110
|
return result.hits.length;
|
|
@@ -325,7 +318,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
325
318
|
if (!Array.isArray(rawTestcases)) {
|
|
326
319
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
327
320
|
}
|
|
328
|
-
const
|
|
321
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
329
322
|
const results = [];
|
|
330
323
|
for (const rawEvalcase of rawTestcases) {
|
|
331
324
|
if (!isJsonObject(rawEvalcase)) {
|
|
@@ -448,7 +441,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
448
441
|
const assistantContent = assistantMessages[0]?.content;
|
|
449
442
|
const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
450
443
|
const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
451
|
-
const
|
|
444
|
+
const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
445
|
+
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
452
446
|
const userFilePaths = [];
|
|
453
447
|
for (const segment of userSegments) {
|
|
454
448
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -471,7 +465,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
471
465
|
file_paths: allFilePaths,
|
|
472
466
|
code_snippets: codeSnippets,
|
|
473
467
|
outcome,
|
|
474
|
-
|
|
468
|
+
evaluator: testCaseEvaluatorKind,
|
|
469
|
+
evaluators
|
|
475
470
|
};
|
|
476
471
|
if (verbose) {
|
|
477
472
|
console.log(`
|
|
@@ -632,14 +627,88 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
|
632
627
|
}
|
|
633
628
|
return parts.join(" ");
|
|
634
629
|
}
|
|
635
|
-
function
|
|
630
|
+
/**
 * Reads the evaluator list declared on a raw eval case.
 *
 * `execution.evaluators` takes precedence when `execution` is a JSON object;
 * otherwise the top-level `evaluators` field is used. Malformed entries are
 * skipped with a warning rather than failing the whole case.
 *
 * @param {object} rawEvalCase - Raw eval case object from the eval file.
 * @param {string[]} searchRoots - Roots handed to resolveFileReference.
 * @param {string} evalId - Eval case id, used in warning messages only.
 * @returns {Promise<object[] | undefined>} Parsed evaluators, or undefined
 *   when none are declared or none survive validation.
 */
async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
  const { execution } = rawEvalCase;
  const nested = isJsonObject(execution) ? execution.evaluators : void 0;
  const declared = nested ?? rawEvalCase.evaluators;
  if (declared === void 0) {
    return void 0;
  }
  if (!Array.isArray(declared)) {
    logWarning(`Skipping evaluators for '${evalId}': expected array`);
    return void 0;
  }
  // Detail lines listing every location tried for an unresolved reference.
  const describeAttempts = (resolution) => resolution.attempted.length > 0 ? resolution.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0;
  const parsed = [];
  for (const entry of declared) {
    if (!isJsonObject(entry)) {
      logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
      continue;
    }
    const name = asString(entry.name);
    const kind = entry.type;
    if (!name || !isEvaluatorKind(kind)) {
      logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
      continue;
    }
    if (kind === "code") {
      const script = asString(entry.script);
      if (!script) {
        logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
        continue;
      }
      const cwd = asString(entry.cwd);
      let resolvedCwd;
      if (cwd) {
        const resolution = await resolveFileReference(cwd, searchRoots);
        if (resolution.resolvedPath) {
          resolvedCwd = import_node_path2.default.resolve(resolution.resolvedPath);
        } else {
          // The evaluator is still registered; only resolvedCwd stays unset.
          logWarning(
            `Code evaluator '${name}' in '${evalId}': cwd not found (${resolution.displayPath})`,
            describeAttempts(resolution)
          );
        }
      }
      parsed.push({ name, type: "code", script, cwd, resolvedCwd });
    } else {
      const prompt = asString(entry.prompt);
      let promptPath;
      if (prompt) {
        const resolution = await resolveFileReference(prompt, searchRoots);
        if (resolution.resolvedPath) {
          promptPath = import_node_path2.default.resolve(resolution.resolvedPath);
        } else {
          // Not a resolvable file: the prompt text itself is kept as-is.
          logWarning(
            `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolution.displayPath})`,
            describeAttempts(resolution)
          );
        }
      }
      const model = asString(entry.model);
      parsed.push({ name, type: "llm_judge", prompt, promptPath, model });
    }
  }
  return parsed.length > 0 ? parsed : void 0;
}
|
|
704
|
+
/**
 * Narrows a raw `evaluator` setting to a known evaluator kind.
 *
 * @param {unknown} candidate - Raw value from the eval file.
 * @param {string} contextId - Where the value came from; used in the warning.
 * @returns {string | undefined} The kind when recognised, otherwise undefined.
 *   Unrecognised strings are reported through logWarning; non-strings are
 *   ignored silently.
 */
function coerceEvaluator(candidate, contextId) {
  const isText = typeof candidate === "string";
  if (isText && isEvaluatorKind(candidate)) {
    return candidate;
  }
  if (isText) {
    logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
  }
  return void 0;
}
|
|
645
714
|
function logWarning(message, details) {
|
|
@@ -835,6 +904,214 @@ var GeminiProvider = class {
|
|
|
835
904
|
}
|
|
836
905
|
};
|
|
837
906
|
|
|
907
|
+
// src/evaluation/providers/cli.ts
|
|
908
|
+
var import_node_child_process = require("child_process");
|
|
909
|
+
var import_node_path3 = __toESM(require("path"), 1);
|
|
910
|
+
var import_node_util = require("util");
|
|
911
|
+
var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
|
|
912
|
+
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
|
|
913
|
+
/**
 * Runs a shell command through the promisified `exec` and normalises the
 * outcome into a plain result object instead of throwing.
 *
 * @param {string} command - Fully rendered command line.
 * @param {{cwd?: string, env?: object, timeoutMs?: number, signal?: AbortSignal}} options
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number | null,
 *   failed: boolean, timedOut: boolean, signal: string | null}>}
 */
async function defaultCommandRunner(command, options) {
  const execOptions = {
    cwd: options.cwd,
    env: options.env,
    timeout: options.timeoutMs,
    signal: options.signal,
    maxBuffer: DEFAULT_MAX_BUFFER,
    // On Windows the command is interpreted by PowerShell, not cmd.exe.
    shell: process.platform === "win32" ? "powershell.exe" : void 0
  };
  try {
    const { stdout, stderr } = await execAsync(command, execOptions);
    return {
      stdout,
      stderr,
      exitCode: 0,
      failed: false,
      timedOut: false,
      signal: null
    };
  } catch (error) {
    const execError = error;
    // `killed` is set whenever the child was terminated by the parent, which
    // also happens on abort and on maxBuffer overflow. Only treat it as a
    // timeout when neither of those explains the kill.
    const wasAborted = execError.name === "AbortError" || execError.code === "ABORT_ERR";
    const overflowed = execError.code === "ERR_CHILD_PROCESS_STDIO_MAXBUFFER";
    const killedByTimeout = execError.killed === true && !wasAborted && !overflowed;
    return {
      stdout: execError.stdout ?? "",
      stderr: execError.stderr ?? "",
      // Non-numeric codes (e.g. "ENOENT") are reported as an unknown exit code.
      exitCode: typeof execError.code === "number" ? execError.code : null,
      failed: true,
      timedOut: execError.timedOut === true || killedByTimeout,
      signal: execError.signal ?? null
    };
  }
}
|
|
944
|
+
/**
 * Provider that answers a request by rendering a command template and
 * running it through a command runner (injectable for tests).
 *
 * An optional healthcheck (HTTP GET or command) runs before the first
 * invocation and its outcome is shared by later invocations.
 */
var CliProvider = class {
  id;
  kind = "cli";
  targetName;
  supportsBatch = false;
  config;
  runCommand;
  healthcheckPromise;
  /**
   * @param {string} targetName - Target name; also used in error messages.
   * @param {object} config - Resolved CLI config (commandTemplate, cwd, env,
   *   timeoutMs, filesFormat, healthcheck).
   * @param {Function} [runner] - Command runner; defaults to defaultCommandRunner.
   */
  constructor(targetName, config, runner = defaultCommandRunner) {
    this.targetName = targetName;
    this.id = `cli:${targetName}`;
    this.config = config;
    this.runCommand = runner;
  }
  /**
   * Renders and runs the configured command for one request.
   *
   * @param {object} request - Provider request (prompt, guidelines, inputFiles,
   *   evalCaseId, attempt, signal).
   * @returns {Promise<{text: string, raw: object}>} stdout as `text` plus
   *   execution details.
   * @throws {Error} On abort, timeout, failed healthcheck or non-zero exit.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("CLI provider request was aborted before execution");
    }
    await this.ensureHealthy(request.signal);
    const templateValues = buildTemplateValues(request, this.config);
    const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
    const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
    const result = await this.runCommand(renderedCommand, {
      cwd: this.config.cwd,
      env,
      timeoutMs: this.config.timeoutMs,
      signal: request.signal
    });
    if (result.failed || (result.exitCode ?? 0) !== 0) {
      // Abort is checked first so a cancelled run is never reported as a timeout.
      if (request.signal?.aborted) {
        throw new Error("CLI provider request was aborted");
      }
      if (result.timedOut) {
        throw new Error(
          `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
        );
      }
      const codeText = result.exitCode !== null ? result.exitCode : "unknown";
      const detail = result.stderr.trim() || result.stdout.trim();
      const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
      throw new Error(message);
    }
    return {
      text: result.stdout,
      raw: {
        command: renderedCommand,
        stderr: result.stderr,
        exitCode: result.exitCode ?? 0,
        cwd: this.config.cwd
      }
    };
  }
  /**
   * Runs the configured healthcheck once and shares the outcome.
   *
   * A failure stays cached so later requests fail fast, except when the
   * failure coincides with the triggering request's signal being aborted:
   * that says nothing about the target, so the next request re-runs the check.
   *
   * @param {AbortSignal} [signal] - Signal of the request that triggers the check.
   * @returns {Promise<void>}
   */
  async ensureHealthy(signal) {
    if (!this.config.healthcheck) {
      return;
    }
    if (!this.healthcheckPromise) {
      const pending = this.runHealthcheck(this.config.healthcheck, signal);
      this.healthcheckPromise = pending;
      pending.catch(() => {
        if (signal?.aborted && this.healthcheckPromise === pending) {
          this.healthcheckPromise = void 0;
        }
      });
    }
    return this.healthcheckPromise;
  }
  /**
   * Executes a single healthcheck.
   *
   * @param {object} healthcheck - `{type: "http", url}` or a command-style
   *   healthcheck with `commandTemplate` and optional `cwd`.
   * @param {AbortSignal} [signal] - Cancels the check when aborted.
   * @returns {Promise<void>}
   * @throws {Error} When the probe fails, times out or is aborted.
   */
  async runHealthcheck(healthcheck, signal) {
    if (!healthcheck) {
      return;
    }
    const timeoutMs = healthcheck.timeoutMs ?? this.config.timeoutMs;
    if (healthcheck.type === "http") {
      const controller = new AbortController();
      const abortProbe = () => controller.abort();
      const timer = timeoutMs ? setTimeout(abortProbe, timeoutMs) : void 0;
      if (signal?.aborted) {
        // An already-aborted signal never fires "abort" again; cancel now.
        abortProbe();
      } else {
        signal?.addEventListener("abort", abortProbe, { once: true });
      }
      try {
        const response = await fetch(healthcheck.url, { method: "GET", signal: controller.signal });
        if (!response.ok) {
          throw new Error(`HTTP ${response.status} ${response.statusText}`);
        }
      } catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        throw new Error(`CLI healthcheck failed for '${this.targetName}': ${reason}`);
      } finally {
        if (timer !== void 0) {
          clearTimeout(timer);
        }
        // Do not leave a listener behind on a signal that outlives the probe.
        signal?.removeEventListener("abort", abortProbe);
      }
      return;
    }
    // Command healthcheck: rendered with empty request values.
    const renderedCommand = renderTemplate(
      healthcheck.commandTemplate,
      buildTemplateValues(
        {
          prompt: "",
          guidelines: "",
          inputFiles: [],
          evalCaseId: "",
          attempt: 0
        },
        this.config
      )
    );
    const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
    const result = await this.runCommand(renderedCommand, {
      cwd: healthcheck.cwd ?? this.config.cwd,
      env,
      timeoutMs,
      signal
    });
    if (result.failed || (result.exitCode ?? 0) !== 0) {
      const codeText = result.exitCode !== null ? result.exitCode : "unknown";
      const detail = result.stderr.trim() || result.stdout.trim();
      const message = detail ? `${detail} (exit code ${codeText})` : `CLI healthcheck command exited with code ${codeText}`;
      throw new Error(`CLI healthcheck failed for '${this.targetName}': ${message}`);
    }
  }
};
|
|
1057
|
+
/**
 * Builds the placeholder map used when rendering a CLI command template.
 * Every value is shell-escaped before it is inserted into the command.
 *
 * @param {object} request - Provider request (prompt, guidelines, evalCaseId,
 *   attempt, inputFiles).
 * @param {object} config - CLI config; only `filesFormat` is read here.
 * @returns {{PROMPT: string, GUIDELINES: string, EVAL_ID: string,
 *   ATTEMPT: string, FILES: string}}
 */
function buildTemplateValues(request, config) {
  const files = normalizeInputFiles(request.inputFiles);
  const values = {};
  values.PROMPT = shellEscape(request.prompt ?? "");
  values.GUIDELINES = shellEscape(request.guidelines ?? "");
  values.EVAL_ID = shellEscape(request.evalCaseId ?? "");
  values.ATTEMPT = shellEscape(String(request.attempt ?? 0));
  values.FILES = formatFileList(files, config.filesFormat);
  return values;
}
|
|
1067
|
+
/**
 * Resolves input file paths to absolute paths and drops duplicates,
 * keeping first-seen order.
 *
 * @param {string[] | undefined} inputFiles - Paths as supplied on the request.
 * @returns {string[] | undefined} Unique absolute paths, or undefined when
 *   the input is missing or empty.
 */
function normalizeInputFiles(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return void 0;
  }
  const seen = /* @__PURE__ */ new Set();
  for (const inputFile of inputFiles) {
    seen.add(import_node_path3.default.resolve(inputFile));
  }
  return [...seen];
}
|
|
1080
|
+
/**
 * Renders the `{FILES}` placeholder: one formatted, shell-escaped entry per
 * file, joined with single spaces.
 *
 * The per-file template may contain `{path}` (escaped absolute path) and
 * `{basename}` (escaped file name). Substitution is done in a single pass
 * with a replacer function, so `$`-sequences in a path (`$&`, `$'`, `$$`)
 * are inserted literally and text inserted for `{path}` is never scanned
 * again for `{basename}`.
 *
 * @param {string[] | undefined} files - Absolute file paths.
 * @param {string | undefined} template - Per-file format; defaults to "{path}".
 * @returns {string} The rendered list, or "" when there are no files.
 */
function formatFileList(files, template) {
  if (!files || files.length === 0) {
    return "";
  }
  const formatter = template ?? "{path}";
  return files.map((filePath) => {
    const escapedPath = shellEscape(filePath);
    const escapedName = shellEscape(import_node_path3.default.basename(filePath));
    return formatter.replace(
      /\{(path|basename)\}/g,
      (_match, key) => key === "path" ? escapedPath : escapedName
    );
  }).join(" ");
}
|
|
1091
|
+
/**
 * Substitutes `{NAME}` placeholders (upper-case letters and underscores) in
 * a command template.
 *
 * @param {string} template - Template text.
 * @param {Record<string, string>} values - Placeholder values keyed by name.
 * @returns {string} The template with known placeholders replaced; a
 *   placeholder whose value is undefined is left untouched.
 */
function renderTemplate(template, values) {
  const substitute = (placeholder, name) => {
    const value = values[name];
    if (value === void 0) {
      return placeholder;
    }
    return value;
  };
  return template.replace(/\{([A-Z_]+)\}/g, substitute);
}
|
|
1097
|
+
/**
 * Quotes a value so the shell passes it through as one literal argument.
 *
 * - POSIX shells: single-quoted, with embedded `'` written as `'"'"'`.
 * - Windows: commands run under PowerShell, so the value is emitted as a
 *   PowerShell single-quoted (verbatim) string with quote characters doubled.
 *   A double-quoted string would let `$var`, `$(...)` and backticks in the
 *   value be evaluated, and `\"` is not a PowerShell escape.
 *
 * @param {string} value - Raw text (prompt, path, id, ...).
 * @returns {string} The quoted form; `''` for an empty string.
 */
function shellEscape(value) {
  if (value.length === 0) {
    return "''";
  }
  if (process.platform === "win32") {
    // PowerShell also accepts the typographic quotes U+2018..U+201B as
    // single-quote delimiters, so they are doubled as well.
    const escaped = value.replace(/['\u2018\u2019\u201A\u201B]/g, (quote) => quote + quote);
    return `'${escaped}'`;
  }
  return `'${value.replace(/'/g, `'"'"'`)}'`;
}
|
|
1107
|
+
/**
 * Formats the " after Ns" suffix appended to timeout error messages.
 *
 * @param {number | undefined} timeoutMs - Configured timeout in milliseconds.
 * @returns {string} Suffix with the timeout rounded up to whole seconds, or
 *   "" when no positive timeout is configured.
 */
function formatTimeoutSuffix(timeoutMs) {
  if (timeoutMs && !(timeoutMs <= 0)) {
    const wholeSeconds = Math.ceil(timeoutMs / 1e3);
    return ` after ${wholeSeconds}s`;
  }
  return "";
}
|
|
1114
|
+
|
|
838
1115
|
// src/evaluation/providers/mock.ts
|
|
839
1116
|
var DEFAULT_MOCK_RESPONSE = '{"answer":"Mock provider response. Configure targets.yaml to supply a custom value."}';
|
|
840
1117
|
var MockProvider = class {
|
|
@@ -878,6 +1155,7 @@ var MockProvider = class {
|
|
|
878
1155
|
|
|
879
1156
|
// src/evaluation/providers/targets.ts
|
|
880
1157
|
var import_zod = require("zod");
|
|
1158
|
+
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES"]);
|
|
881
1159
|
var BASE_TARGET_SCHEMA = import_zod.z.object({
|
|
882
1160
|
name: import_zod.z.string().min(1, "target name is required"),
|
|
883
1161
|
provider: import_zod.z.string().min(1, "provider is required"),
|
|
@@ -900,6 +1178,9 @@ function normalizeAzureApiVersion(value) {
|
|
|
900
1178
|
function resolveTargetDefinition(definition, env = process.env) {
|
|
901
1179
|
const parsed = BASE_TARGET_SCHEMA.parse(definition);
|
|
902
1180
|
const provider = parsed.provider.toLowerCase();
|
|
1181
|
+
const providerBatching = resolveOptionalBoolean(
|
|
1182
|
+
parsed.settings?.provider_batching ?? parsed.settings?.providerBatching
|
|
1183
|
+
);
|
|
903
1184
|
switch (provider) {
|
|
904
1185
|
case "azure":
|
|
905
1186
|
case "azure-openai":
|
|
@@ -908,6 +1189,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
908
1189
|
name: parsed.name,
|
|
909
1190
|
judgeTarget: parsed.judge_target,
|
|
910
1191
|
workers: parsed.workers,
|
|
1192
|
+
providerBatching,
|
|
911
1193
|
config: resolveAzureConfig(parsed, env)
|
|
912
1194
|
};
|
|
913
1195
|
case "anthropic":
|
|
@@ -916,6 +1198,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
916
1198
|
name: parsed.name,
|
|
917
1199
|
judgeTarget: parsed.judge_target,
|
|
918
1200
|
workers: parsed.workers,
|
|
1201
|
+
providerBatching,
|
|
919
1202
|
config: resolveAnthropicConfig(parsed, env)
|
|
920
1203
|
};
|
|
921
1204
|
case "gemini":
|
|
@@ -926,14 +1209,26 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
926
1209
|
name: parsed.name,
|
|
927
1210
|
judgeTarget: parsed.judge_target,
|
|
928
1211
|
workers: parsed.workers,
|
|
1212
|
+
providerBatching,
|
|
929
1213
|
config: resolveGeminiConfig(parsed, env)
|
|
930
1214
|
};
|
|
1215
|
+
case "codex":
|
|
1216
|
+
case "codex-cli":
|
|
1217
|
+
return {
|
|
1218
|
+
kind: "codex",
|
|
1219
|
+
name: parsed.name,
|
|
1220
|
+
judgeTarget: parsed.judge_target,
|
|
1221
|
+
workers: parsed.workers,
|
|
1222
|
+
providerBatching,
|
|
1223
|
+
config: resolveCodexConfig(parsed, env)
|
|
1224
|
+
};
|
|
931
1225
|
case "mock":
|
|
932
1226
|
return {
|
|
933
1227
|
kind: "mock",
|
|
934
1228
|
name: parsed.name,
|
|
935
1229
|
judgeTarget: parsed.judge_target,
|
|
936
1230
|
workers: parsed.workers,
|
|
1231
|
+
providerBatching,
|
|
937
1232
|
config: resolveMockConfig(parsed)
|
|
938
1233
|
};
|
|
939
1234
|
case "vscode":
|
|
@@ -943,8 +1238,18 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
943
1238
|
name: parsed.name,
|
|
944
1239
|
judgeTarget: parsed.judge_target,
|
|
945
1240
|
workers: parsed.workers,
|
|
1241
|
+
providerBatching,
|
|
946
1242
|
config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
|
|
947
1243
|
};
|
|
1244
|
+
case "cli":
|
|
1245
|
+
return {
|
|
1246
|
+
kind: "cli",
|
|
1247
|
+
name: parsed.name,
|
|
1248
|
+
judgeTarget: parsed.judge_target,
|
|
1249
|
+
workers: parsed.workers,
|
|
1250
|
+
providerBatching,
|
|
1251
|
+
config: resolveCliConfig(parsed, env)
|
|
1252
|
+
};
|
|
948
1253
|
default:
|
|
949
1254
|
throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
|
|
950
1255
|
}
|
|
@@ -1012,6 +1317,29 @@ function resolveGeminiConfig(target, env) {
|
|
|
1012
1317
|
maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`)
|
|
1013
1318
|
};
|
|
1014
1319
|
}
|
|
1320
|
+
/**
 * Resolves the settings of a `codex` target into provider config.
 *
 * Accepted setting aliases: `executable` / `command` / `binary`,
 * `args` / `arguments`, `timeout_seconds` / `timeoutSeconds`.
 *
 * @param {object} target - Parsed target definition (name, settings).
 * @param {object} env - Environment map used to resolve references.
 * @returns {{executable: string, args: string[] | undefined,
 *   cwd: string | undefined, timeoutMs: number | undefined}}
 */
function resolveCodexConfig(target, env) {
  const settings = target.settings ?? {};
  const label = (what) => `${target.name} codex ${what}`;
  const resolvedExecutable = resolveOptionalString(
    settings.executable ?? settings.command ?? settings.binary,
    env,
    label("executable"),
    { allowLiteral: true, optionalEnv: true }
  );
  const args = resolveOptionalStringArray(settings.args ?? settings.arguments, env, label("args"));
  const cwd = resolveOptionalString(settings.cwd, env, label("cwd"), {
    allowLiteral: true,
    optionalEnv: true
  });
  const timeoutMs = resolveTimeoutMs(
    settings.timeout_seconds ?? settings.timeoutSeconds,
    label("timeout")
  );
  return {
    // Falls back to the `codex` binary on PATH when nothing is configured.
    executable: resolvedExecutable ?? "codex",
    args,
    cwd,
    timeoutMs
  };
}
|
|
1015
1343
|
function resolveMockConfig(target) {
|
|
1016
1344
|
const settings = target.settings ?? {};
|
|
1017
1345
|
const response = typeof settings.response === "string" ? settings.response : void 0;
|
|
@@ -1041,6 +1369,125 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
1041
1369
|
workspaceTemplate
|
|
1042
1370
|
};
|
|
1043
1371
|
}
|
|
1372
|
+
/**
 * Resolves the settings of a `cli` target into provider config.
 *
 * Both snake_case and camelCase setting names are accepted. The command
 * template is required and may only use the supported placeholders.
 *
 * @param {object} target - Parsed target definition (name, settings).
 * @param {object} env - Environment map used to resolve references.
 * @returns {{commandTemplate: string, filesFormat: string | undefined,
 *   cwd: string | undefined, env: object | undefined,
 *   timeoutMs: number | undefined, healthcheck: object | undefined}}
 * @throws {Error} When a setting is invalid or the template uses an
 *   unsupported placeholder.
 */
function resolveCliConfig(target, env) {
  const settings = target.settings ?? {};
  const targetName = target.name;
  const templateDescription = `${targetName} CLI command template`;
  const rawTemplate = settings.command_template ?? settings.commandTemplate;
  const rawFilesFormat = settings.files_format ?? settings.filesFormat ?? settings.attachments_format ?? settings.attachmentsFormat;
  // Resolution order is kept stable so the first reported error is stable too.
  const filesFormat = resolveOptionalLiteralString(rawFilesFormat);
  const cwd = resolveOptionalString(settings.cwd, env, `${targetName} working directory`, {
    allowLiteral: true,
    optionalEnv: true
  });
  const envOverrides = resolveEnvOverrides(settings.env, env, targetName);
  const timeoutMs = resolveTimeoutMs(
    settings.timeout_seconds ?? settings.timeoutSeconds,
    `${targetName} timeout`
  );
  const healthcheck = resolveCliHealthcheck(settings.healthcheck, env, targetName);
  const commandTemplate = resolveString(rawTemplate, env, templateDescription, true);
  assertSupportedCliPlaceholders(commandTemplate, templateDescription);
  return {
    commandTemplate,
    filesFormat,
    cwd,
    env: envOverrides,
    timeoutMs,
    healthcheck
  };
}
|
|
1401
|
+
/**
 * Resolves the `env` setting of a CLI target into a map of string overrides.
 *
 * @param {unknown} source - Raw `env` setting.
 * @param {object} env - Environment map used to resolve each value.
 * @param {string} targetName - Target name used in error messages.
 * @returns {Record<string, string> | undefined} Resolved overrides, or
 *   undefined when the setting is absent or has no entries.
 * @throws {Error} When the setting is not an object or a value is not a string.
 */
function resolveEnvOverrides(source, env, targetName) {
  if (source === void 0 || source === null) {
    return void 0;
  }
  const isPlainMap = typeof source === "object" && !Array.isArray(source);
  if (!isPlainMap) {
    throw new Error(`${targetName} env overrides must be an object map of strings`);
  }
  const overrides = {};
  for (const [name, rawValue] of Object.entries(source)) {
    if (typeof rawValue !== "string") {
      throw new Error(`${targetName} env override '${name}' must be a string`);
    }
    overrides[name] = resolveString(rawValue, env, `${targetName} env override '${name}'`);
  }
  if (Object.keys(overrides).length === 0) {
    return void 0;
  }
  return overrides;
}
|
|
1419
|
+
/**
 * Converts a timeout setting given in seconds into whole milliseconds.
 *
 * @param {unknown} source - Raw timeout setting (seconds).
 * @param {string} description - Label used in error messages.
 * @returns {number | undefined} Timeout in milliseconds (at least 1), or
 *   undefined when no timeout is configured.
 * @throws {Error} When the timeout is zero or negative.
 */
function resolveTimeoutMs(source, description) {
  const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
  if (seconds === void 0) {
    return void 0;
  }
  if (seconds <= 0) {
    throw new Error(`${description} must be greater than zero seconds`);
  }
  // A positive sub-millisecond value would floor to 0, which child_process
  // and the HTTP healthcheck both treat as "no timeout". Clamp to 1 ms so a
  // configured timeout is never silently disabled.
  return Math.max(1, Math.floor(seconds * 1e3));
}
|
|
1429
|
+
/**
 * Resolves the `healthcheck` setting of a CLI target.
 *
 * @param {unknown} source - Raw healthcheck setting.
 * @param {object} env - Environment map used to resolve references.
 * @param {string} targetName - Target name used in error messages.
 * @returns {object | undefined} `{type: "http", url, timeoutMs}`,
 *   `{type: "command", commandTemplate, timeoutMs, cwd}`, or undefined when
 *   no healthcheck is configured.
 * @throws {Error} When the setting is not an object, the type is unknown, or
 *   a field fails to resolve.
 */
function resolveCliHealthcheck(source, env, targetName) {
  if (source === void 0 || source === null) {
    return void 0;
  }
  if (typeof source !== "object" || Array.isArray(source)) {
    throw new Error(`${targetName} healthcheck must be an object`);
  }
  const candidate = source;
  const type = candidate.type;
  // The timeout is validated before the type so its error takes precedence.
  const timeoutMs = resolveTimeoutMs(
    candidate.timeout_seconds ?? candidate.timeoutSeconds,
    `${targetName} healthcheck timeout`
  );
  switch (type) {
    case "http": {
      const url = resolveString(candidate.url, env, `${targetName} healthcheck URL`);
      return { type: "http", url, timeoutMs };
    }
    case "command": {
      const templateDescription = `${targetName} healthcheck command template`;
      const commandTemplate = resolveString(
        candidate.command_template ?? candidate.commandTemplate,
        env,
        templateDescription,
        true
      );
      assertSupportedCliPlaceholders(commandTemplate, templateDescription);
      const cwd = resolveOptionalString(candidate.cwd, env, `${targetName} healthcheck cwd`, {
        allowLiteral: true,
        optionalEnv: true
      });
      return { type: "command", commandTemplate, timeoutMs, cwd };
    }
    default:
      throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
  }
}
|
|
1471
|
+
/**
 * Rejects a command template that uses a placeholder the CLI provider does
 * not know how to fill.
 *
 * @param {string} template - Command template to validate.
 * @param {string} description - Label used in the error message.
 * @throws {Error} Naming the first unsupported placeholder found.
 */
function assertSupportedCliPlaceholders(template, description) {
  const unsupported = extractCliPlaceholders(template).find(
    (placeholder) => !CLI_PLACEHOLDERS.has(placeholder)
  );
  if (unsupported === void 0) {
    return;
  }
  throw new Error(
    `${description} includes unsupported placeholder '{${unsupported}}'. Supported placeholders: ${Array.from(CLI_PLACEHOLDERS).join(", ")}`
  );
}
|
|
1481
|
+
/**
 * Lists the `{NAME}` placeholders (upper-case letters and underscores) that
 * occur in a template, in order of appearance, duplicates included.
 *
 * @param {string} template - Command template text.
 * @returns {string[]} Placeholder names without the surrounding braces.
 */
function extractCliPlaceholders(template) {
  return Array.from(template.matchAll(/\{([A-Z_]+)\}/g), (match) => match[1]).filter(Boolean);
}
|
|
1044
1491
|
function resolveString(source, env, description, allowLiteral = false) {
|
|
1045
1492
|
const value = resolveOptionalString(source, env, description, {
|
|
1046
1493
|
allowLiteral,
|
|
@@ -1071,11 +1518,14 @@ function resolveOptionalString(source, env, description, options) {
|
|
|
1071
1518
|
}
|
|
1072
1519
|
const allowLiteral = options?.allowLiteral ?? false;
|
|
1073
1520
|
const optionalEnv = options?.optionalEnv ?? false;
|
|
1074
|
-
|
|
1521
|
+
const looksLikeEnv = isLikelyEnvReference(trimmed);
|
|
1522
|
+
if (looksLikeEnv) {
|
|
1075
1523
|
if (optionalEnv) {
|
|
1076
1524
|
return void 0;
|
|
1077
1525
|
}
|
|
1078
|
-
|
|
1526
|
+
if (!allowLiteral) {
|
|
1527
|
+
throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
|
|
1528
|
+
}
|
|
1079
1529
|
}
|
|
1080
1530
|
return trimmed;
|
|
1081
1531
|
}
|
|
@@ -1125,15 +1575,48 @@ function resolveOptionalBoolean(source) {
|
|
|
1125
1575
|
/**
 * Heuristic: a value made up solely of upper-case letters, digits and
 * underscores is treated as the name of an environment variable.
 *
 * @param {string} value - Trimmed setting value.
 * @returns {boolean} True when the value looks like an env variable name.
 */
function isLikelyEnvReference(value) {
  const envNamePattern = /^[A-Z0-9_]+$/;
  return envNamePattern.test(value);
}
|
|
1578
|
+
/**
 * Resolves an optional array-of-strings setting. Each item is trimmed; an
 * item that names a variable present in `env` is replaced by that
 * variable's value, any other item is kept literally.
 *
 * Only own string entries of `env` count as variables, so items such as
 * "constructor" or "toString" are never confused with members inherited
 * from Object.prototype.
 *
 * @param {unknown} source - Raw setting value.
 * @param {object} env - Environment map.
 * @param {string} description - Label used in error messages.
 * @returns {string[] | undefined} Resolved items, or undefined when the
 *   setting is absent or an empty array.
 * @throws {Error} When the setting is not an array, an item is not a string
 *   or is blank, or a referenced variable is blank.
 */
function resolveOptionalStringArray(source, env, description) {
  if (source === void 0 || source === null) {
    return void 0;
  }
  if (!Array.isArray(source)) {
    throw new Error(`${description} must be an array of strings`);
  }
  if (source.length === 0) {
    return void 0;
  }
  const resolved = [];
  for (let i = 0; i < source.length; i++) {
    const item = source[i];
    if (typeof item !== "string") {
      throw new Error(`${description}[${i}] must be a string`);
    }
    const trimmed = item.trim();
    if (trimmed.length === 0) {
      throw new Error(`${description}[${i}] cannot be empty`);
    }
    const envValue = Object.prototype.hasOwnProperty.call(env, trimmed) ? env[trimmed] : void 0;
    if (typeof envValue === "string") {
      if (envValue.trim().length === 0) {
        throw new Error(`Environment variable '${trimmed}' for ${description}[${i}] is empty`);
      }
      // The variable's value is passed through untrimmed.
      resolved.push(envValue);
    } else {
      resolved.push(trimmed);
    }
  }
  return resolved;
}
|
|
1128
1610
|
|
|
1129
1611
|
// src/evaluation/providers/vscode.ts
|
|
1130
1612
|
var import_promises3 = require("fs/promises");
|
|
1131
|
-
var
|
|
1613
|
+
var import_node_path4 = __toESM(require("path"), 1);
|
|
1132
1614
|
var import_subagent = require("subagent");
|
|
1133
1615
|
var VSCodeProvider = class {
|
|
1134
1616
|
id;
|
|
1135
1617
|
kind;
|
|
1136
1618
|
targetName;
|
|
1619
|
+
supportsBatch = true;
|
|
1137
1620
|
config;
|
|
1138
1621
|
constructor(targetName, config, kind) {
|
|
1139
1622
|
this.id = `${kind}:${targetName}`;
|
|
@@ -1145,12 +1628,11 @@ var VSCodeProvider = class {
|
|
|
1145
1628
|
if (request.signal?.aborted) {
|
|
1146
1629
|
throw new Error("VS Code provider request was aborted before dispatch");
|
|
1147
1630
|
}
|
|
1148
|
-
const
|
|
1149
|
-
const promptContent = buildPromptDocument(request,
|
|
1631
|
+
const inputFiles = normalizeAttachments(request.inputFiles);
|
|
1632
|
+
const promptContent = buildPromptDocument(request, inputFiles, request.guideline_patterns);
|
|
1150
1633
|
const session = await (0, import_subagent.dispatchAgentSession)({
|
|
1151
1634
|
userQuery: promptContent,
|
|
1152
|
-
|
|
1153
|
-
extraAttachments: attachments,
|
|
1635
|
+
extraAttachments: inputFiles,
|
|
1154
1636
|
wait: this.config.waitForResponse,
|
|
1155
1637
|
dryRun: this.config.dryRun,
|
|
1156
1638
|
vscodeCmd: this.config.command,
|
|
@@ -1167,7 +1649,7 @@ var VSCodeProvider = class {
|
|
|
1167
1649
|
text: "",
|
|
1168
1650
|
raw: {
|
|
1169
1651
|
session,
|
|
1170
|
-
|
|
1652
|
+
inputFiles
|
|
1171
1653
|
}
|
|
1172
1654
|
};
|
|
1173
1655
|
}
|
|
@@ -1176,42 +1658,106 @@ var VSCodeProvider = class {
|
|
|
1176
1658
|
text: responseText,
|
|
1177
1659
|
raw: {
|
|
1178
1660
|
session,
|
|
1179
|
-
|
|
1661
|
+
inputFiles
|
|
1180
1662
|
}
|
|
1181
1663
|
};
|
|
1182
1664
|
}
|
|
1665
|
+
async invokeBatch(requests) {
|
|
1666
|
+
if (requests.length === 0) {
|
|
1667
|
+
return [];
|
|
1668
|
+
}
|
|
1669
|
+
const normalizedRequests = requests.map((req) => ({
|
|
1670
|
+
request: req,
|
|
1671
|
+
inputFiles: normalizeAttachments(req.inputFiles)
|
|
1672
|
+
}));
|
|
1673
|
+
const combinedInputFiles = mergeAttachments(
|
|
1674
|
+
normalizedRequests.map(({ inputFiles }) => inputFiles)
|
|
1675
|
+
);
|
|
1676
|
+
const userQueries = normalizedRequests.map(
|
|
1677
|
+
({ request, inputFiles }) => buildPromptDocument(request, inputFiles, request.guideline_patterns)
|
|
1678
|
+
);
|
|
1679
|
+
const session = await (0, import_subagent.dispatchBatchAgent)({
|
|
1680
|
+
userQueries,
|
|
1681
|
+
extraAttachments: combinedInputFiles,
|
|
1682
|
+
wait: this.config.waitForResponse,
|
|
1683
|
+
dryRun: this.config.dryRun,
|
|
1684
|
+
vscodeCmd: this.config.command,
|
|
1685
|
+
subagentRoot: this.config.subagentRoot,
|
|
1686
|
+
workspaceTemplate: this.config.workspaceTemplate,
|
|
1687
|
+
silent: true
|
|
1688
|
+
});
|
|
1689
|
+
if (session.exitCode !== 0 || !session.responseFiles) {
|
|
1690
|
+
const failure = session.error ?? "VS Code subagent did not produce batch responses";
|
|
1691
|
+
throw new Error(failure);
|
|
1692
|
+
}
|
|
1693
|
+
if (this.config.dryRun) {
|
|
1694
|
+
return normalizedRequests.map(({ inputFiles }) => ({
|
|
1695
|
+
text: "",
|
|
1696
|
+
raw: {
|
|
1697
|
+
session,
|
|
1698
|
+
inputFiles,
|
|
1699
|
+
allInputFiles: combinedInputFiles
|
|
1700
|
+
}
|
|
1701
|
+
}));
|
|
1702
|
+
}
|
|
1703
|
+
if (session.responseFiles.length !== requests.length) {
|
|
1704
|
+
throw new Error(
|
|
1705
|
+
`VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
|
|
1706
|
+
);
|
|
1707
|
+
}
|
|
1708
|
+
const responses = [];
|
|
1709
|
+
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
1710
|
+
const responseText = await (0, import_promises3.readFile)(responseFile, "utf8");
|
|
1711
|
+
responses.push({
|
|
1712
|
+
text: responseText,
|
|
1713
|
+
raw: {
|
|
1714
|
+
session,
|
|
1715
|
+
inputFiles: normalizedRequests[index]?.inputFiles,
|
|
1716
|
+
allInputFiles: combinedInputFiles,
|
|
1717
|
+
responseFile
|
|
1718
|
+
}
|
|
1719
|
+
});
|
|
1720
|
+
}
|
|
1721
|
+
return responses;
|
|
1722
|
+
}
|
|
1183
1723
|
};
|
|
1184
1724
|
/**
 * Assemble the prompt text sent to the VS Code subagent.
 *
 * The document is an optional "mandatory pre-read" block (guideline files
 * first, then the remaining attachments) followed by the `user_query`
 * section containing the trimmed request prompt.
 *
 * @param {{ prompt: string }} request - Provider request; only `prompt` is read here.
 * @param {string[] | undefined} attachments - Attachment paths for this request.
 * @param {unknown} guidelinePatterns - Forwarded unchanged to `collectGuidelineFiles`.
 * @returns {string} The trimmed prompt document.
 */
function buildPromptDocument(request, attachments, guidelinePatterns) {
  const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
  const otherAttachments = [];
  for (const file of collectAttachmentFiles(attachments)) {
    // Guidelines are listed in their own section, so keep them out of this one.
    if (!guidelineFiles.includes(file)) {
      otherAttachments.push(file);
    }
  }
  const preread = buildMandatoryPrereadBlock(guidelineFiles, otherAttachments);
  const segments = preread.length > 0 ? ["\n", preread] : [];
  segments.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
  return segments.join("\n").trim();
}
|
|
1193
|
-
/**
 * Build the instruction block that tells the agent which files to read
 * before answering.
 *
 * @param {string[]} guidelineFiles - Absolute paths of guideline files.
 * @param {string[]} attachmentFiles - Absolute paths of non-guideline attachments.
 * @returns {string} The block text, or "" when both lists are empty.
 */
function buildMandatoryPrereadBlock(guidelineFiles, attachmentFiles) {
  const hasGuidelines = guidelineFiles.length > 0;
  const hasAttachments = attachmentFiles.length > 0;
  if (!hasGuidelines && !hasAttachments) {
    return "";
  }
  // One markdown bullet per file: "* [basename](file-uri)".
  const toBulletList = (files) => files.map(
    (absolutePath) => `* [${import_node_path4.default.basename(absolutePath)}](${pathToFileUri(absolutePath)})`
  ).join("\n");
  const sections = [];
  if (hasGuidelines) {
    sections.push(`Read all guideline files:\n${toBulletList(guidelineFiles)}.`);
  }
  if (hasAttachments) {
    sections.push(`Read all attachment files:\n${toBulletList(attachmentFiles)}.`);
  }
  sections.push("If any file is missing, fail with ERROR: missing-file <filename> and stop.");
  sections.push("Then apply system_instructions on the user query below.");
  return sections.join("\n");
}
|
|
1216
1762
|
function collectGuidelineFiles(attachments, guidelinePatterns) {
|
|
1217
1763
|
if (!attachments || attachments.length === 0) {
|
|
@@ -1219,8 +1765,8 @@ function collectGuidelineFiles(attachments, guidelinePatterns) {
|
|
|
1219
1765
|
}
|
|
1220
1766
|
const unique = /* @__PURE__ */ new Map();
|
|
1221
1767
|
for (const attachment of attachments) {
|
|
1222
|
-
const absolutePath =
|
|
1223
|
-
const normalized = absolutePath.split(
|
|
1768
|
+
const absolutePath = import_node_path4.default.resolve(attachment);
|
|
1769
|
+
const normalized = absolutePath.split(import_node_path4.default.sep).join("/");
|
|
1224
1770
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1225
1771
|
if (!unique.has(absolutePath)) {
|
|
1226
1772
|
unique.set(absolutePath, absolutePath);
|
|
@@ -1229,8 +1775,21 @@ function collectGuidelineFiles(attachments, guidelinePatterns) {
|
|
|
1229
1775
|
}
|
|
1230
1776
|
return Array.from(unique.values());
|
|
1231
1777
|
}
|
|
1778
|
+
/**
 * Resolve every attachment to an absolute path and drop duplicates,
 * keeping first-seen order.
 *
 * @param {string[] | undefined} attachments - Attachment paths (may be missing or empty).
 * @returns {string[]} Unique absolute paths; an empty array when nothing was supplied.
 */
function collectAttachmentFiles(attachments) {
  if (!attachments || attachments.length === 0) {
    return [];
  }
  // A Set preserves insertion order, so the first occurrence of each path wins.
  const seen = new Set();
  for (const attachment of attachments) {
    seen.add(import_node_path4.default.resolve(attachment));
  }
  return [...seen];
}
|
|
1232
1791
|
function pathToFileUri(filePath) {
|
|
1233
|
-
const absolutePath =
|
|
1792
|
+
const absolutePath = import_node_path4.default.isAbsolute(filePath) ? filePath : import_node_path4.default.resolve(filePath);
|
|
1234
1793
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
1235
1794
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
1236
1795
|
return `file:///${normalizedPath}`;
|
|
@@ -1243,10 +1802,20 @@ function normalizeAttachments(attachments) {
|
|
|
1243
1802
|
}
|
|
1244
1803
|
const deduped = /* @__PURE__ */ new Set();
|
|
1245
1804
|
for (const attachment of attachments) {
|
|
1246
|
-
deduped.add(
|
|
1805
|
+
deduped.add(import_node_path4.default.resolve(attachment));
|
|
1247
1806
|
}
|
|
1248
1807
|
return Array.from(deduped);
|
|
1249
1808
|
}
|
|
1809
|
+
/**
 * Merge several per-request attachment lists into one de-duplicated list
 * of absolute paths, in first-seen order.
 *
 * @param {Array<string[] | undefined>} all - One (possibly missing) list per request.
 * @returns {string[] | undefined} The merged list, or undefined when no file was found.
 */
function mergeAttachments(all) {
  const merged = new Set();
  for (const list of all) {
    // Falsy entries (requests without attachments) contribute nothing.
    for (const inputFile of list || []) {
      merged.add(import_node_path4.default.resolve(inputFile));
    }
  }
  if (merged.size === 0) {
    return undefined;
  }
  return [...merged];
}
|
|
1250
1819
|
async function ensureVSCodeSubagents(options) {
|
|
1251
1820
|
const { kind, count, verbose = false } = options;
|
|
1252
1821
|
const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
|
|
@@ -1274,22 +1843,598 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
1274
1843
|
provisioned: true,
|
|
1275
1844
|
message: `Provisioned ${count} subagent(s): ${result.created.length} created, ${result.skippedExisting.length} reused`
|
|
1276
1845
|
};
|
|
1277
|
-
} catch (error) {
|
|
1278
|
-
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
1279
|
-
if (verbose) {
|
|
1280
|
-
console.warn(`Provisioning failed (continuing anyway): ${errorMessage}`);
|
|
1846
|
+
} catch (error) {
|
|
1847
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
1848
|
+
if (verbose) {
|
|
1849
|
+
console.warn(`Provisioning failed (continuing anyway): ${errorMessage}`);
|
|
1850
|
+
}
|
|
1851
|
+
return {
|
|
1852
|
+
provisioned: false,
|
|
1853
|
+
message: `Provisioning failed: ${errorMessage}`
|
|
1854
|
+
};
|
|
1855
|
+
}
|
|
1856
|
+
}
|
|
1857
|
+
|
|
1858
|
+
// src/evaluation/providers/codex.ts
|
|
1859
|
+
var import_node_child_process2 = require("child_process");
|
|
1860
|
+
var import_node_fs3 = require("fs");
|
|
1861
|
+
var import_promises4 = require("fs/promises");
|
|
1862
|
+
var import_node_os = require("os");
|
|
1863
|
+
var import_node_path6 = __toESM(require("path"), 1);
|
|
1864
|
+
var import_node_util2 = require("util");
|
|
1865
|
+
|
|
1866
|
+
// src/evaluation/providers/preread.ts
|
|
1867
|
+
var import_node_path5 = __toESM(require("path"), 1);
|
|
1868
|
+
/**
 * Assemble the prompt document: an optional mandatory pre-read block
 * (guideline files, then the other input files) followed by the
 * `user_query` section with the trimmed request prompt.
 *
 * @param {{ prompt: string, guideline_patterns?: unknown }} request - Provider request.
 * @param {string[] | undefined} inputFiles - Input file paths to reference.
 * @param {{ guidelinePatterns?: unknown, guidelineOverrides?: Set<string> }} [options]
 *   `guidelinePatterns` falls back to `request.guideline_patterns` when nullish;
 *   `guidelineOverrides` is passed through to `collectGuidelineFiles2`.
 * @returns {string} The trimmed prompt document.
 */
function buildPromptDocument2(request, inputFiles, options) {
  const patterns = options?.guidelinePatterns ?? request.guideline_patterns;
  const guidelineFiles = collectGuidelineFiles2(inputFiles, patterns, options?.guidelineOverrides);
  const otherInputFiles = [];
  for (const file of collectInputFiles(inputFiles)) {
    // Guidelines get their own section; do not list them twice.
    if (!guidelineFiles.includes(file)) {
      otherInputFiles.push(file);
    }
  }
  const preread = buildMandatoryPrereadBlock2(guidelineFiles, otherInputFiles);
  const segments = preread.length > 0 ? ["\n", preread] : [];
  segments.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
  return segments.join("\n").trim();
}
|
|
1886
|
+
/**
 * Resolve input files to absolute paths and remove duplicates,
 * keeping first-seen order.
 *
 * @param {string[] | undefined} inputFiles - Input file paths.
 * @returns {string[] | undefined} Unique absolute paths, or undefined when
 *   the input is missing or empty.
 */
function normalizeInputFiles2(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return undefined;
  }
  const seen = new Set();
  for (const inputFile of inputFiles) {
    seen.add(import_node_path5.default.resolve(inputFile));
  }
  return [...seen];
}
|
|
1899
|
+
/**
 * Pick the input files that count as guideline files.
 *
 * A file qualifies when its absolute path is in `overrides`, or when
 * `isGuidelineFile` accepts its forward-slash-normalized absolute path.
 * Results are unique and in first-seen order.
 *
 * @param {string[] | undefined} inputFiles - Candidate file paths.
 * @param {unknown} guidelinePatterns - Forwarded unchanged to `isGuidelineFile`.
 * @param {Set<string>} [overrides] - Absolute paths to treat as guidelines
 *   without consulting the patterns.
 * @returns {string[]} Absolute paths of the guideline files.
 */
function collectGuidelineFiles2(inputFiles, guidelinePatterns, overrides) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  const selected = new Set();
  for (const inputFile of inputFiles) {
    const absolutePath = import_node_path5.default.resolve(inputFile);
    // Overrides short-circuit, so pattern matching is skipped for them.
    const isGuideline = overrides?.has(absolutePath) || isGuidelineFile(
      absolutePath.split(import_node_path5.default.sep).join("/"),
      guidelinePatterns
    );
    if (isGuideline) {
      selected.add(absolutePath);
    }
  }
  return [...selected];
}
|
|
1921
|
+
/**
 * Resolve input files to absolute paths and remove duplicates,
 * keeping first-seen order.
 *
 * @param {string[] | undefined} inputFiles - Input file paths.
 * @returns {string[]} Unique absolute paths; an empty array when nothing was supplied.
 */
function collectInputFiles(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  const seen = new Set();
  for (const inputFile of inputFiles) {
    seen.add(import_node_path5.default.resolve(inputFile));
  }
  return [...seen];
}
|
|
1934
|
+
/**
 * Build the instruction block that tells the agent which files to read
 * before answering.
 *
 * @param {string[]} guidelineFiles - Absolute paths of guideline files.
 * @param {string[]} inputFiles - Absolute paths of the remaining input files.
 * @returns {string} The block text, or "" when both lists are empty.
 */
function buildMandatoryPrereadBlock2(guidelineFiles, inputFiles) {
  const hasGuidelines = guidelineFiles.length > 0;
  const hasInputFiles = inputFiles.length > 0;
  if (!hasGuidelines && !hasInputFiles) {
    return "";
  }
  // One markdown bullet per file: "* [basename](file-uri)".
  const toBulletList = (files) => files.map(
    (absolutePath) => `* [${import_node_path5.default.basename(absolutePath)}](${pathToFileUri2(absolutePath)})`
  ).join("\n");
  const sections = [];
  if (hasGuidelines) {
    sections.push(`Read all guideline files:\n${toBulletList(guidelineFiles)}.`);
  }
  if (hasInputFiles) {
    sections.push(`Read all input files:\n${toBulletList(inputFiles)}.`);
  }
  sections.push("If any file is missing, fail with ERROR: missing-file <filename> and stop.");
  sections.push("Then apply system_instructions on the user query below.");
  return sections.join("\n");
}
|
|
1958
|
+
/**
 * Turn a file path into a `file://` URI string.
 *
 * Relative paths are resolved first and backslashes become forward slashes.
 * Paths starting with a drive letter ("C:/...") get three slashes after the
 * scheme. No percent-encoding is applied.
 *
 * @param {string} filePath - Absolute or relative file path.
 * @returns {string} The `file://` URI.
 */
function pathToFileUri2(filePath) {
  const pathModule = import_node_path5.default;
  const absolutePath = pathModule.isAbsolute(filePath) ? filePath : pathModule.resolve(filePath);
  const forwardSlashed = absolutePath.split("\\").join("/");
  // Drive-letter paths carry no leading slash, so one is added via the prefix.
  const prefix = /^[a-zA-Z]:\//.test(forwardSlashed) ? "file:///" : "file://";
  return `${prefix}${forwardSlashed}`;
}
|
|
1966
|
+
|
|
1967
|
+
// src/evaluation/providers/codex.ts
|
|
1968
|
+
// Promise-returning wrapper around child_process.exec.
const execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
// Prefix of the temporary workspace directory created for each Codex invocation.
const WORKSPACE_PREFIX = "agentv-codex-";
// Name of the prompt file written into the workspace root.
const PROMPT_FILENAME = "prompt.md";
// Workspace sub-directory that receives copies of the input files.
const FILES_DIR = "files";
// `type` value of the JSONL events that are inspected for assistant text.
const JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
1973
|
+
/**
 * Provider that answers a request by running the Codex CLI.
 *
 * Each `invoke` creates a temporary workspace, copies the input files into
 * it, writes the prompt to `prompt.md`, runs the CLI with the prompt on
 * stdin, parses the JSON/JSONL output and removes the workspace again.
 */
var CodexProvider = class {
  id;
  kind = "codex";
  targetName;
  supportsBatch = false;
  config;
  runCodex;
  environmentCheck;
  resolvedExecutable;

  /**
   * @param {string} targetName - Target name; also used to build `id`.
   * @param {object} config - Settings read here: `executable`, `args`, `cwd`, `timeoutMs`.
   * @param {Function} [runner] - Function that actually runs the CLI.
   */
  constructor(targetName, config, runner = defaultCodexRunner) {
    this.id = `codex:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runCodex = runner;
  }

  /**
   * Run one request through the Codex CLI.
   *
   * @param {object} request - Fields read: `signal`, `inputFiles`, `guideline_patterns`, `prompt`.
   * @returns {Promise<{ text: string, raw: object }>} Assistant text plus run details.
   * @throws {Error} When already aborted, on timeout, on a non-zero exit code,
   *   or when the output contains no assistant message.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Codex provider request was aborted before execution");
    }
    await this.ensureEnvironmentReady();

    const sourceFiles = normalizeInputFiles2(request.inputFiles);
    // Guidelines are identified on the ORIGINAL paths; the copies may be renamed.
    const guidelineSources = new Set();
    for (const file of collectGuidelineFiles2(sourceFiles, request.guideline_patterns)) {
      guidelineSources.add(import_node_path6.default.resolve(file));
    }

    const workspaceRoot = await this.createWorkspace();
    try {
      const { mirroredInputFiles, guidelineMirrors } = await this.mirrorInputFiles(
        sourceFiles,
        workspaceRoot,
        guidelineSources
      );
      const promptContent = buildPromptDocument2(request, mirroredInputFiles, {
        guidelinePatterns: request.guideline_patterns,
        guidelineOverrides: guidelineMirrors
      });
      const promptFile = import_node_path6.default.join(workspaceRoot, PROMPT_FILENAME);
      await (0, import_promises4.writeFile)(promptFile, promptContent, "utf8");

      const args = this.buildCodexArgs();
      const cwd = this.resolveCwd(workspaceRoot);
      const { timedOut, exitCode, stdout, stderr } = await this.executeCodex(
        args,
        cwd,
        promptContent,
        request.signal
      );
      if (timedOut) {
        const suffix = formatTimeoutSuffix2(this.config.timeoutMs ?? void 0);
        throw new Error(`Codex CLI timed out${suffix}`);
      }
      if (exitCode !== 0) {
        const detail = pickDetail(stderr, stdout);
        const prefix = `Codex CLI exited with code ${exitCode}`;
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }

      const parsed = parseCodexJson(stdout);
      const text = extractAssistantText(parsed);
      return {
        text,
        raw: {
          response: parsed,
          stdout,
          stderr,
          exitCode,
          args,
          executable: this.resolvedExecutable ?? this.config.executable,
          promptFile,
          workspace: workspaceRoot,
          inputFiles: mirroredInputFiles
        }
      };
    } finally {
      await this.cleanupWorkspace(workspaceRoot);
    }
  }

  /** Start the environment check on first use and await the same promise afterwards. */
  async ensureEnvironmentReady() {
    this.environmentCheck = this.environmentCheck || this.validateEnvironment();
    await this.environmentCheck;
  }

  /** Locate the configured executable and remember the resolved path. */
  async validateEnvironment() {
    this.resolvedExecutable = await locateExecutable(this.config.executable);
  }

  /**
   * @param {string} workspaceRoot - Temporary workspace directory.
   * @returns {string} The resolved `config.cwd` when set, otherwise `workspaceRoot`.
   */
  resolveCwd(workspaceRoot) {
    return this.config.cwd ? import_node_path6.default.resolve(this.config.cwd) : workspaceRoot;
  }

  /**
   * @returns {string[]} Fixed CLI flags, then `config.args`, then "-".
   */
  buildCodexArgs() {
    const baseArgs = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
    const extraArgs = this.config.args && this.config.args.length > 0 ? this.config.args : [];
    return [...baseArgs, ...extraArgs, "-"];
  }

  /**
   * Call the runner; an ENOENT failure is rethrown with a more helpful message.
   *
   * @param {string[]} args - CLI arguments.
   * @param {string} cwd - Working directory for the process.
   * @param {string} promptContent - Prompt text handed to the runner.
   * @param {AbortSignal} [signal] - Optional abort signal.
   * @returns {Promise<object>} Whatever the runner resolves with.
   */
  async executeCodex(args, cwd, promptContent, signal) {
    const runnerOptions = {
      executable: this.resolvedExecutable ?? this.config.executable,
      args,
      cwd,
      prompt: promptContent,
      timeoutMs: this.config.timeoutMs,
      env: process.env,
      signal
    };
    try {
      return await this.runCodex(runnerOptions);
    } catch (error) {
      if (error.code !== "ENOENT") {
        throw error;
      }
      throw new Error(
        `Codex executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
      );
    }
  }

  /**
   * Copy the input files into `<workspaceRoot>/files`.
   *
   * A repeated base name gets a numeric suffix (`name`, `name.1`, `name.2`, ...).
   *
   * @param {string[] | undefined} inputFiles - Source paths.
   * @param {string} workspaceRoot - Temporary workspace directory.
   * @param {Set<string>} guidelineOriginals - Absolute source paths that are guidelines.
   * @returns {Promise<{ mirroredInputFiles: string[] | undefined, guidelineMirrors: Set<string> }>}
   *   Copied paths, plus the subset copied from guideline sources.
   */
  async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
    const guidelineMirrors = new Set();
    if (!inputFiles || inputFiles.length === 0) {
      return { mirroredInputFiles: void 0, guidelineMirrors };
    }
    const filesRoot = import_node_path6.default.join(workspaceRoot, FILES_DIR);
    await (0, import_promises4.mkdir)(filesRoot, { recursive: true });

    const mirroredInputFiles = [];
    const occurrences = new Map();
    for (const inputFile of inputFiles) {
      const source = import_node_path6.default.resolve(inputFile);
      const baseName = import_node_path6.default.basename(source);
      const seenBefore = occurrences.get(baseName) ?? 0;
      occurrences.set(baseName, seenBefore + 1);
      const targetName = seenBefore === 0 ? baseName : `${baseName}.${seenBefore}`;
      const destination = import_node_path6.default.join(filesRoot, targetName);
      await (0, import_promises4.copyFile)(source, destination);
      const mirrorPath = import_node_path6.default.resolve(destination);
      mirroredInputFiles.push(mirrorPath);
      if (guidelineOriginals.has(source)) {
        guidelineMirrors.add(mirrorPath);
      }
    }
    return { mirroredInputFiles, guidelineMirrors };
  }

  /** Create a fresh temporary directory under the OS temp dir. */
  async createWorkspace() {
    const template = import_node_path6.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX);
    return await (0, import_promises4.mkdtemp)(template);
  }

  /** Remove the workspace; failures are deliberately ignored. */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await (0, import_promises4.rm)(workspaceRoot, { recursive: true, force: true });
    } catch {
      // Best-effort cleanup: a leftover temp directory must not fail the request.
    }
  }
};
|
|
2128
|
+
/**
 * Resolve the Codex executable to a concrete path.
 *
 * A candidate containing a path separator is treated as a path: it is made
 * absolute, passed through `ensureWindowsExecutableVariant`, and checked for
 * existence. Otherwise the platform lookup tool (`where` on Windows, `which`
 * elsewhere) searches PATH and the preferred hit is verified.
 *
 * The lookup tool is run with `execFile` and an argument array rather than a
 * shell command string, so a name containing spaces or shell metacharacters
 * is passed through literally instead of being interpreted by the shell.
 *
 * @param {string} candidate - Executable name or path from the target settings.
 * @returns {Promise<string>} Path of the located executable.
 * @throws {Error} When a bare name cannot be found on PATH (the lookup failure
 *   is attached as `cause`), or when an explicit path does not exist (the
 *   error from `fs.access` propagates).
 */
async function locateExecutable(candidate) {
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
  if (includesPathSeparator) {
    const resolved = import_node_path6.default.isAbsolute(candidate) ? candidate : import_node_path6.default.resolve(candidate);
    const executablePath = await ensureWindowsExecutableVariant(resolved);
    await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
    return executablePath;
  }
  const locator = process.platform === "win32" ? "where" : "which";
  const execFileAsync = (0, import_node_util2.promisify)(import_node_child_process2.execFile);
  let lookupError;
  try {
    // No shell is involved: `candidate` is handed over as a single argv entry.
    const { stdout } = await execFileAsync(locator, [candidate]);
    const lines = stdout.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
    const preferred = selectExecutableCandidate(lines);
    if (preferred) {
      const executablePath = await ensureWindowsExecutableVariant(preferred);
      await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
      return executablePath;
    }
  } catch (error) {
    // Any lookup failure means "not found"; keep the reason for diagnostics.
    lookupError = error;
  }
  throw new Error(`Codex executable '${candidate}' was not found on PATH`, { cause: lookupError });
}
|
|
2150
|
+
/**
 * Choose which lookup hit to use.
 *
 * Outside Windows the first hit wins. On Windows the known executable
 * extensions are tried in order and the first hit ending in one of them is
 * chosen; if none match, the first hit is used.
 *
 * @param {string[]} candidates - Paths printed by the lookup tool.
 * @returns {string | undefined} The chosen path, or undefined for an empty list.
 */
function selectExecutableCandidate(candidates) {
  if (candidates.length === 0) {
    return undefined;
  }
  const firstHit = candidates[0];
  if (process.platform === "win32") {
    for (const ext of getWindowsExecutableExtensions()) {
      const match = candidates.find((candidate) => candidate.toLowerCase().endsWith(ext));
      if (match) {
        return match;
      }
    }
  }
  return firstHit;
}
|
|
2166
|
+
/**
 * On Windows, find an existing file for a path that lacks an executable
 * extension by trying each known extension in turn.
 *
 * @param {string} candidate - Path to the executable.
 * @returns {Promise<string>} The first existing `candidate + ext`; otherwise
 *   `candidate` unchanged (always unchanged outside Windows, or when it
 *   already ends in a known extension).
 */
async function ensureWindowsExecutableVariant(candidate) {
  if (process.platform !== "win32" || hasExecutableExtension(candidate)) {
    return candidate;
  }
  for (const ext of getWindowsExecutableExtensions()) {
    const withExtension = `${candidate}${ext}`;
    try {
      await (0, import_promises4.access)(withExtension, import_node_fs3.constants.F_OK);
      return withExtension;
    } catch {
      // This variant does not exist; try the next extension.
    }
  }
  return candidate;
}
|
|
2184
|
+
/**
 * Check whether a path already ends in one of the extensions returned by
 * `getWindowsExecutableExtensions` (compared in lower case).
 *
 * @param {string} candidate - Path or file name.
 * @returns {boolean} True when a known extension matches.
 */
function hasExecutableExtension(candidate) {
  const lowered = candidate.toLowerCase();
  for (const ext of getWindowsExecutableExtensions()) {
    if (lowered.endsWith(ext)) {
      return true;
    }
  }
  return false;
}
|
|
2188
|
+
// Extensions used when PATHEXT is unset or contains no usable entry.
var DEFAULT_WINDOWS_EXTENSIONS = [".com", ".exe", ".bat", ".cmd", ".ps1"];

/**
 * List the file extensions treated as executable on Windows.
 *
 * @returns {string[]} An empty array outside Windows; otherwise the
 *   lower-cased, non-empty entries of PATHEXT, or the defaults when PATHEXT
 *   yields nothing.
 */
function getWindowsExecutableExtensions() {
  if (process.platform !== "win32") {
    return [];
  }
  const fromEnv = [];
  for (const entry of process.env.PATHEXT?.split(";") ?? []) {
    const ext = entry.trim().toLowerCase();
    if (ext.length > 0) {
      fromEnv.push(ext);
    }
  }
  return fromEnv.length > 0 ? fromEnv : DEFAULT_WINDOWS_EXTENSIONS;
}
|
|
2196
|
+
/**
 * Parse the stdout of the Codex CLI in `--json` mode.
 *
 * Attempts, in order: the whole output as one JSON document; the output as
 * JSON Lines (via `parseJsonLines`); the text from the last "{" to the end.
 *
 * @param {string} output - Raw stdout.
 * @returns {unknown} The parsed value (an array when parsed as JSON Lines).
 * @throws {Error} When the output is blank or none of the attempts succeed.
 */
function parseCodexJson(output) {
  const text = output.trim();
  if (text.length === 0) {
    throw new Error("Codex CLI produced no output in --json mode");
  }
  try {
    return JSON.parse(text);
  } catch {
    // Not a single JSON document; fall through to the tolerant strategies.
  }
  const events = parseJsonLines(text);
  if (events) {
    return events;
  }
  const braceIndex = text.lastIndexOf("{");
  if (braceIndex >= 0) {
    try {
      return JSON.parse(text.slice(braceIndex));
    } catch {
      // The trailing fragment is not valid JSON either.
    }
  }
  const ellipsis = text.length > 200 ? "\u2026" : "";
  throw new Error(`Codex CLI emitted invalid JSON: ${text.slice(0, 200)}${ellipsis}`);
}
|
|
2220
|
+
/**
 * Find the assistant's reply in parsed Codex output.
 *
 * Sources are tried in order: the event stream (array input); the value as a
 * single event; the last `messages` entry with role "assistant" that has
 * text; `response.content`; `output`.
 *
 * @param {unknown} parsed - Value returned by `parseCodexJson`.
 * @returns {string} The assistant text.
 * @throws {Error} When no source yields any text.
 */
function extractAssistantText(parsed) {
  const missingMessage = "Codex CLI JSON response did not include an assistant message";
  if (Array.isArray(parsed)) {
    const streamText = extractFromEventStream(parsed);
    if (streamText) {
      return streamText;
    }
  }
  if (!parsed || typeof parsed !== "object") {
    throw new Error(missingMessage);
  }
  const record = parsed;
  const eventText = extractFromEvent(record);
  if (eventText) {
    return eventText;
  }
  if (Array.isArray(record.messages)) {
    const messages = record.messages;
    // Walk backwards so the most recent assistant message wins.
    let cursor = messages.length;
    while (cursor-- > 0) {
      const entry = messages[cursor];
      if (!entry || typeof entry !== "object" || entry.role !== "assistant") {
        continue;
      }
      const messageText = flattenContent(entry.content);
      if (messageText) {
        return messageText;
      }
    }
  }
  const response = record.response;
  if (response && typeof response === "object") {
    const responseText = flattenContent(response.content);
    if (responseText) {
      return responseText;
    }
  }
  const outputText = flattenContent(record.output);
  if (outputText) {
    return outputText;
  }
  throw new Error(missingMessage);
}
|
|
2268
|
+
/**
 * Scan a list of events from last to first and return the first text found.
 *
 * @param {unknown[]} events - Parsed JSONL events.
 * @returns {string | undefined} Text of the latest event that has any.
 */
function extractFromEventStream(events) {
  let cursor = events.length;
  while (cursor-- > 0) {
    const text = extractFromEvent(events[cursor]);
    if (text) {
      return text;
    }
  }
  return undefined;
}
|
|
2278
|
+
/**
 * Pull text out of a single event.
 *
 * For an "item.completed" event the `item` is inspected first; after that
 * the event's own `output` (or, when that is nullish, `content`) is flattened.
 *
 * @param {unknown} event - One parsed event.
 * @returns {string | undefined} The text, or undefined when there is none.
 */
function extractFromEvent(event) {
  if (!event || typeof event !== "object") {
    return undefined;
  }
  const record = event;
  const eventType = typeof record.type === "string" ? record.type : undefined;
  if (eventType === JSONL_TYPE_ITEM_COMPLETED) {
    const itemText = extractFromItem(record.item);
    if (itemText) {
      return itemText;
    }
  }
  const flattened = flattenContent(record.output ?? record.content);
  // An empty string counts as "no text".
  return flattened ? flattened : undefined;
}
|
|
2298
|
+
/**
 * Pull text out of an event item.
 *
 * Only items of type "agent_message", "response" or "output" are read; the
 * text comes from the first non-nullish of `text`, `content`, `output`.
 *
 * @param {unknown} item - The `item` field of an event.
 * @returns {string | undefined} The text, or undefined when there is none.
 */
function extractFromItem(item) {
  if (!item || typeof item !== "object") {
    return undefined;
  }
  const record = item;
  const itemType = typeof record.type === "string" ? record.type : undefined;
  const carriesText = itemType === "agent_message" || itemType === "response" || itemType === "output";
  if (!carriesText) {
    return undefined;
  }
  const text = flattenContent(record.text ?? record.content ?? record.output);
  // An empty string counts as "no text".
  return text ? text : undefined;
}
|
|
2312
|
+
/**
 * Reduce a content value to plain text.
 *
 * - A string is returned as is.
 * - An array: string segments and the string `text` of object segments are
 *   collected, empty ones dropped, and the rest joined with " \n".
 * - An object with a `text` property: that property, if it is a string.
 *
 * @param {unknown} value - Content in any of the shapes above.
 * @returns {string | undefined} The text, or undefined when none can be derived.
 */
function flattenContent(value) {
  if (typeof value === "string") {
    return value;
  }
  const textOf = (holder) => typeof holder.text === "string" ? holder.text : undefined;
  if (Array.isArray(value)) {
    const parts = [];
    for (const segment of value) {
      let piece;
      if (typeof segment === "string") {
        piece = segment;
      } else if (segment && typeof segment === "object" && "text" in segment) {
        piece = textOf(segment);
      }
      if (typeof piece === "string" && piece.length > 0) {
        parts.push(piece);
      }
    }
    return parts.length > 0 ? parts.join(" \n") : undefined;
  }
  if (value && typeof value === "object" && "text" in value) {
    return textOf(value);
  }
  return undefined;
}
|
|
2335
|
+
/**
 * Parse text as JSON Lines (one JSON document per non-blank line).
 *
 * @param {string} output - Text to parse.
 * @returns {unknown[] | undefined} Parsed values in line order; undefined
 *   when there are fewer than two non-blank lines or any line is not valid JSON.
 */
function parseJsonLines(output) {
  const lines = [];
  for (const rawLine of output.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line.length > 0) {
      lines.push(line);
    }
  }
  if (lines.length <= 1) {
    return undefined;
  }
  try {
    return lines.map((line) => JSON.parse(line));
  } catch {
    // One bad line invalidates the whole interpretation.
    return undefined;
  }
}
|
|
2350
|
+
/**
 * Choose the text to attach to a failure message: trimmed stderr when it
 * has content, otherwise trimmed stdout.
 *
 * @param {string} stderr - Captured standard error.
 * @param {string} stdout - Captured standard output.
 * @returns {string | undefined} The detail text, or undefined when both are blank.
 */
function pickDetail(stderr, stdout) {
  const fromStderr = stderr.trim();
  if (fromStderr.length > 0) {
    return fromStderr;
  }
  const fromStdout = stdout.trim();
  if (fromStdout.length > 0) {
    return fromStdout;
  }
  return undefined;
}
|
|
2358
|
+
/**
 * Format the " after Ns" suffix for a timeout error message.
 *
 * @param {number | undefined} timeoutMs - Timeout in milliseconds.
 * @returns {string} " after <seconds>s" with seconds rounded up, or "" when
 *   the timeout is missing, zero or negative.
 */
function formatTimeoutSuffix2(timeoutMs) {
  const hasTimeout = Boolean(timeoutMs) && !(timeoutMs <= 0);
  if (!hasTimeout) {
    return "";
  }
  return ` after ${Math.ceil(timeoutMs / 1e3)}s`;
}
|
|
2365
|
+
/**
 * Default runner: spawn the Codex executable, feed the prompt on stdin and
 * collect stdout/stderr until the process closes.
 *
 * The child is sent SIGTERM when `options.signal` aborts or when
 * `options.timeoutMs` elapses (the latter sets `timedOut`).
 *
 * Errors on the child's stdin are handled explicitly. Without a listener, an
 * EPIPE raised when the child exits or is killed before consuming the prompt
 * would surface as an uncaught exception. A closed pipe is ignored, because
 * the `close` event still reports the outcome; any other stdin error stops
 * the child and rejects.
 *
 * @param {object} options
 * @param {string} options.executable - Program to run.
 * @param {string[]} options.args - Arguments for the program.
 * @param {string} options.cwd - Working directory.
 * @param {object} options.env - Environment for the child process.
 * @param {string} options.prompt - Text written to the child's stdin.
 * @param {number} [options.timeoutMs] - Kill the child after this many milliseconds.
 * @param {AbortSignal} [options.signal] - Kill the child when aborted.
 * @returns {Promise<{ stdout: string, stderr: string, exitCode: number, timedOut: boolean }>}
 *   `exitCode` is -1 when the process closed without a numeric code.
 */
async function defaultCodexRunner(options) {
  return await new Promise((resolve, reject) => {
    const child = (0, import_node_child_process2.spawn)(options.executable, options.args, {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: shouldShellExecute(options.executable)
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    let timeoutHandle;
    const onAbort = () => {
      child.kill("SIGTERM");
    };
    // Release the timer and the abort listener once the child is done.
    const cleanup = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", onAbort);
      }
    };
    if (options.signal) {
      if (options.signal.aborted) {
        onAbort();
      } else {
        options.signal.addEventListener("abort", onAbort, { once: true });
      }
    }
    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill("SIGTERM");
      }, options.timeoutMs);
      // Do not keep the event loop alive just for this timer.
      timeoutHandle.unref?.();
    }
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    child.stdin.on("error", (error) => {
      // The child went away before reading the prompt; `close` reports the result.
      if (error?.code === "EPIPE" || error?.code === "ERR_STREAM_DESTROYED") {
        return;
      }
      cleanup();
      child.kill("SIGTERM");
      reject(error);
    });
    child.stdin.end(options.prompt);
    child.on("error", (error) => {
      cleanup();
      reject(error);
    });
    child.on("close", (code) => {
      cleanup();
      resolve({
        stdout,
        stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
|
|
2426
|
+
/**
 * Decides whether an executable must be launched through a shell.
 *
 * @param {string} executable - Path or name of the program to launch.
 * @returns {boolean} `true` only on Windows (`process.platform === "win32"`)
 *   when the name ends in `.cmd`, `.bat` or `.ps1` (case-insensitive);
 *   `false` on every other platform.
 */
function shouldShellExecute(executable) {
  const onWindows = process.platform === "win32";
  if (!onWindows) {
    return false;
  }
  const normalized = executable.toLowerCase();
  return [".cmd", ".bat", ".ps1"].some((extension) => normalized.endsWith(extension));
}
|
|
1288
2433
|
|
|
1289
2434
|
// src/evaluation/providers/targets-file.ts
|
|
1290
|
-
var
|
|
1291
|
-
var
|
|
1292
|
-
var
|
|
2435
|
+
var import_node_fs4 = require("fs");
|
|
2436
|
+
var import_promises5 = require("fs/promises");
|
|
2437
|
+
var import_node_path7 = __toESM(require("path"), 1);
|
|
1293
2438
|
var import_yaml2 = require("yaml");
|
|
1294
2439
|
|
|
1295
2440
|
// src/evaluation/providers/types.ts
|
|
@@ -1350,18 +2495,18 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
1350
2495
|
}
|
|
1351
2496
|
/**
 * Reports whether a path exists on disk.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} `true` when `access(filePath, F_OK)` succeeds,
 *   `false` when it rejects for any reason.
 */
async function fileExists3(filePath) {
  let exists = true;
  try {
    await (0, import_promises5.access)(filePath, import_node_fs4.constants.F_OK);
  } catch {
    exists = false;
  }
  return exists;
}
|
|
1359
2504
|
async function readTargetDefinitions(filePath) {
|
|
1360
|
-
const absolutePath =
|
|
2505
|
+
const absolutePath = import_node_path7.default.resolve(filePath);
|
|
1361
2506
|
if (!await fileExists3(absolutePath)) {
|
|
1362
2507
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
1363
2508
|
}
|
|
1364
|
-
const raw = await (0,
|
|
2509
|
+
const raw = await (0, import_promises5.readFile)(absolutePath, "utf8");
|
|
1365
2510
|
const parsed = (0, import_yaml2.parse)(raw);
|
|
1366
2511
|
if (!isRecord(parsed)) {
|
|
1367
2512
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
@@ -1384,6 +2529,10 @@ function createProvider(target) {
|
|
|
1384
2529
|
return new AnthropicProvider(target.name, target.config);
|
|
1385
2530
|
case "gemini":
|
|
1386
2531
|
return new GeminiProvider(target.name, target.config);
|
|
2532
|
+
case "cli":
|
|
2533
|
+
return new CliProvider(target.name, target.config);
|
|
2534
|
+
case "codex":
|
|
2535
|
+
return new CodexProvider(target.name, target.config);
|
|
1387
2536
|
case "mock":
|
|
1388
2537
|
return new MockProvider(target.name, target.config);
|
|
1389
2538
|
case "vscode":
|
|
@@ -1400,230 +2549,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
1400
2549
|
return createProvider(resolved);
|
|
1401
2550
|
}
|
|
1402
2551
|
|
|
1403
|
-
// src/evaluation/
|
|
1404
|
-
var KEY_TERM_MATCH_THRESHOLD = 0.5;
|
|
1405
|
-
var ACTION_WORDS = /* @__PURE__ */ new Set([
|
|
1406
|
-
"use",
|
|
1407
|
-
"avoid",
|
|
1408
|
-
"prefer",
|
|
1409
|
-
"replace",
|
|
1410
|
-
"consider",
|
|
1411
|
-
"ensure",
|
|
1412
|
-
"remove",
|
|
1413
|
-
"add"
|
|
1414
|
-
]);
|
|
1415
|
-
var STOP_WORDS = /* @__PURE__ */ new Set([
|
|
1416
|
-
"the",
|
|
1417
|
-
"a",
|
|
1418
|
-
"an",
|
|
1419
|
-
"and",
|
|
1420
|
-
"or",
|
|
1421
|
-
"but",
|
|
1422
|
-
"in",
|
|
1423
|
-
"on",
|
|
1424
|
-
"at",
|
|
1425
|
-
"to",
|
|
1426
|
-
"for",
|
|
1427
|
-
"of",
|
|
1428
|
-
"with",
|
|
1429
|
-
"by",
|
|
1430
|
-
"is",
|
|
1431
|
-
"are",
|
|
1432
|
-
"was",
|
|
1433
|
-
"were",
|
|
1434
|
-
"be",
|
|
1435
|
-
"been",
|
|
1436
|
-
"being",
|
|
1437
|
-
"have",
|
|
1438
|
-
"has",
|
|
1439
|
-
"had",
|
|
1440
|
-
"do",
|
|
1441
|
-
"does",
|
|
1442
|
-
"did",
|
|
1443
|
-
"will",
|
|
1444
|
-
"would",
|
|
1445
|
-
"could",
|
|
1446
|
-
"should"
|
|
1447
|
-
]);
|
|
1448
|
-
var ERROR_PREFIXES = [
|
|
1449
|
-
"error:",
|
|
1450
|
-
"err:",
|
|
1451
|
-
"vs code command failed",
|
|
1452
|
-
"exception",
|
|
1453
|
-
"traceback",
|
|
1454
|
-
"no response file was generated",
|
|
1455
|
-
"timed out",
|
|
1456
|
-
"cli not found"
|
|
1457
|
-
];
|
|
1458
|
-
function extractAspects(expectedResponse) {
|
|
1459
|
-
const lines = expectedResponse.split(/\r?\n/).map((line) => line.trim());
|
|
1460
|
-
const aspects = [];
|
|
1461
|
-
for (const line of lines) {
|
|
1462
|
-
if (line.length === 0) {
|
|
1463
|
-
continue;
|
|
1464
|
-
}
|
|
1465
|
-
const bulletMatch = /^([-*•]|[0-9]+\.)\s*(.+)$/.exec(line);
|
|
1466
|
-
if (bulletMatch) {
|
|
1467
|
-
const normalized = normalizeAspect(bulletMatch[2]);
|
|
1468
|
-
if (normalized.length > 0) {
|
|
1469
|
-
aspects.push(normalized);
|
|
1470
|
-
}
|
|
1471
|
-
continue;
|
|
1472
|
-
}
|
|
1473
|
-
const lowered = line.toLowerCase();
|
|
1474
|
-
if (Array.from(ACTION_WORDS).some((word) => lowered.startsWith(word))) {
|
|
1475
|
-
const normalized = normalizeAspect(line);
|
|
1476
|
-
if (normalized.length > 0) {
|
|
1477
|
-
aspects.push(normalized);
|
|
1478
|
-
}
|
|
1479
|
-
}
|
|
1480
|
-
}
|
|
1481
|
-
return aspects;
|
|
1482
|
-
}
|
|
1483
|
-
function calculateHits(candidateResponse, expectedAspects) {
|
|
1484
|
-
const { normalizedText, words } = normalizeCandidate(candidateResponse);
|
|
1485
|
-
const hits = [];
|
|
1486
|
-
for (const aspect of expectedAspects) {
|
|
1487
|
-
if (matchesAspect(aspect, normalizedText, words)) {
|
|
1488
|
-
hits.push(aspect);
|
|
1489
|
-
}
|
|
1490
|
-
}
|
|
1491
|
-
return hits;
|
|
1492
|
-
}
|
|
1493
|
-
function calculateMisses(candidateResponse, expectedAspects, resolvedHits) {
|
|
1494
|
-
const hits = new Set(resolvedHits ?? calculateHits(candidateResponse, expectedAspects));
|
|
1495
|
-
return expectedAspects.filter((aspect) => !hits.has(aspect));
|
|
1496
|
-
}
|
|
1497
|
-
function scoreCandidateResponse(candidateResponse, expectedAspects) {
|
|
1498
|
-
if (expectedAspects.length === 0) {
|
|
1499
|
-
if (isErrorLike(candidateResponse)) {
|
|
1500
|
-
return {
|
|
1501
|
-
score: 0,
|
|
1502
|
-
hits: [],
|
|
1503
|
-
misses: ["Model produced an error instead of an answer."],
|
|
1504
|
-
hitCount: 0,
|
|
1505
|
-
totalAspects: 0,
|
|
1506
|
-
rawAspects: []
|
|
1507
|
-
};
|
|
1508
|
-
}
|
|
1509
|
-
return {
|
|
1510
|
-
score: 1,
|
|
1511
|
-
hits: [],
|
|
1512
|
-
misses: [],
|
|
1513
|
-
hitCount: 0,
|
|
1514
|
-
totalAspects: 0,
|
|
1515
|
-
rawAspects: []
|
|
1516
|
-
};
|
|
1517
|
-
}
|
|
1518
|
-
const hits = calculateHits(candidateResponse, expectedAspects);
|
|
1519
|
-
const misses = expectedAspects.filter((aspect) => !hits.includes(aspect));
|
|
1520
|
-
const score = expectedAspects.length > 0 ? hits.length / expectedAspects.length : 0;
|
|
1521
|
-
return {
|
|
1522
|
-
score,
|
|
1523
|
-
hits,
|
|
1524
|
-
misses,
|
|
1525
|
-
hitCount: hits.length,
|
|
1526
|
-
totalAspects: expectedAspects.length,
|
|
1527
|
-
rawAspects: expectedAspects
|
|
1528
|
-
};
|
|
1529
|
-
}
|
|
1530
|
-
function isErrorLike(text) {
|
|
1531
|
-
if (!text) {
|
|
1532
|
-
return false;
|
|
1533
|
-
}
|
|
1534
|
-
const lowered = text.trim().toLowerCase();
|
|
1535
|
-
return ERROR_PREFIXES.some((prefix) => lowered.startsWith(prefix));
|
|
1536
|
-
}
|
|
1537
|
-
function normalizeAspect(aspect) {
|
|
1538
|
-
const sanitized = aspect.toLowerCase().replace(/[^\w\s]/g, " ").replace(/\s+/g, " ").trim();
|
|
1539
|
-
return sanitized;
|
|
1540
|
-
}
|
|
1541
|
-
function normalizeCandidate(candidate) {
|
|
1542
|
-
const lowered = candidate.toLowerCase();
|
|
1543
|
-
const normalizedText = lowered.replace(/[^\w\s]/g, " ");
|
|
1544
|
-
const words = new Set(normalizedText.split(/\s+/).filter((word) => word.length > 0));
|
|
1545
|
-
return { normalizedText, words };
|
|
1546
|
-
}
|
|
1547
|
-
function matchesAspect(aspect, candidateNormalized, candidateWords) {
|
|
1548
|
-
const keyTerms = extractKeyTerms(aspect);
|
|
1549
|
-
if (keyTerms.length === 0) {
|
|
1550
|
-
return false;
|
|
1551
|
-
}
|
|
1552
|
-
const matches = keyTerms.filter((term) => candidateWords.has(term)).length;
|
|
1553
|
-
const ratio = matches / keyTerms.length;
|
|
1554
|
-
if (ratio >= KEY_TERM_MATCH_THRESHOLD) {
|
|
1555
|
-
return true;
|
|
1556
|
-
}
|
|
1557
|
-
const aspectWords = aspect.split(" ");
|
|
1558
|
-
if (aspectWords.length >= 2) {
|
|
1559
|
-
for (let index = 0; index < aspectWords.length - 1; index += 1) {
|
|
1560
|
-
const phrase = `${aspectWords[index]} ${aspectWords[index + 1]}`;
|
|
1561
|
-
if (candidateNormalized.includes(phrase)) {
|
|
1562
|
-
return true;
|
|
1563
|
-
}
|
|
1564
|
-
}
|
|
1565
|
-
}
|
|
1566
|
-
return false;
|
|
1567
|
-
}
|
|
1568
|
-
function extractKeyTerms(aspect, maxTerms = 5) {
|
|
1569
|
-
const terms = [];
|
|
1570
|
-
const words = aspect.split(" ");
|
|
1571
|
-
for (const word of words) {
|
|
1572
|
-
if (word.length <= 2) {
|
|
1573
|
-
continue;
|
|
1574
|
-
}
|
|
1575
|
-
if (STOP_WORDS.has(word)) {
|
|
1576
|
-
continue;
|
|
1577
|
-
}
|
|
1578
|
-
terms.push(word);
|
|
1579
|
-
if (terms.length >= maxTerms) {
|
|
1580
|
-
break;
|
|
1581
|
-
}
|
|
1582
|
-
}
|
|
1583
|
-
return terms;
|
|
1584
|
-
}
|
|
1585
|
-
|
|
1586
|
-
// src/evaluation/grading.ts
|
|
2552
|
+
// src/evaluation/evaluators.ts
|
|
1587
2553
|
var import_node_crypto = require("crypto");
|
|
1588
|
-
var
|
|
1589
|
-
kind = "heuristic";
|
|
1590
|
-
grade(context) {
|
|
1591
|
-
const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
|
|
1592
|
-
const result = scoreCandidateResponse(context.candidate, expectedAspects);
|
|
1593
|
-
const misses = [...result.misses];
|
|
1594
|
-
if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
|
|
1595
|
-
const firstLine = context.candidate.split(/\r?\n/)[0]?.trim();
|
|
1596
|
-
if (firstLine && !misses.includes(firstLine)) {
|
|
1597
|
-
misses.unshift(firstLine);
|
|
1598
|
-
}
|
|
1599
|
-
}
|
|
1600
|
-
return {
|
|
1601
|
-
score: result.score,
|
|
1602
|
-
hits: result.hits,
|
|
1603
|
-
misses,
|
|
1604
|
-
expectedAspectCount: result.totalAspects,
|
|
1605
|
-
rawAspects: result.rawAspects
|
|
1606
|
-
};
|
|
1607
|
-
}
|
|
1608
|
-
};
|
|
1609
|
-
var QualityGrader = class {
|
|
2554
|
+
var LlmJudgeEvaluator = class {
|
|
1610
2555
|
kind = "llm_judge";
|
|
1611
2556
|
resolveJudgeProvider;
|
|
1612
2557
|
maxOutputTokens;
|
|
1613
2558
|
temperature;
|
|
2559
|
+
customPrompt;
|
|
1614
2560
|
constructor(options) {
|
|
1615
2561
|
this.resolveJudgeProvider = options.resolveJudgeProvider;
|
|
1616
2562
|
this.maxOutputTokens = options.maxOutputTokens;
|
|
1617
2563
|
this.temperature = options.temperature;
|
|
2564
|
+
this.customPrompt = options.customPrompt;
|
|
1618
2565
|
}
|
|
1619
|
-
async
|
|
2566
|
+
async evaluate(context) {
|
|
1620
2567
|
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
1621
2568
|
if (!judgeProvider) {
|
|
1622
2569
|
throw new Error("No judge provider available for LLM grading");
|
|
1623
2570
|
}
|
|
1624
2571
|
const prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
2572
|
+
const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
|
|
1625
2573
|
const metadata = {
|
|
1626
|
-
systemPrompt:
|
|
2574
|
+
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2575
|
+
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
1627
2576
|
};
|
|
1628
2577
|
const response = await judgeProvider.invoke({
|
|
1629
2578
|
prompt,
|
|
@@ -1638,12 +2587,13 @@ var QualityGrader = class {
|
|
|
1638
2587
|
const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
1639
2588
|
const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
1640
2589
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
1641
|
-
const
|
|
2590
|
+
const evaluatorRawRequest = {
|
|
1642
2591
|
id: (0, import_node_crypto.randomUUID)(),
|
|
1643
2592
|
provider: judgeProvider.id,
|
|
1644
2593
|
prompt,
|
|
1645
|
-
|
|
1646
|
-
|
|
2594
|
+
target: context.target.name,
|
|
2595
|
+
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2596
|
+
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
1647
2597
|
};
|
|
1648
2598
|
return {
|
|
1649
2599
|
score,
|
|
@@ -1651,7 +2601,7 @@ var QualityGrader = class {
|
|
|
1651
2601
|
misses,
|
|
1652
2602
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
1653
2603
|
reasoning,
|
|
1654
|
-
|
|
2604
|
+
evaluatorRawRequest
|
|
1655
2605
|
};
|
|
1656
2606
|
}
|
|
1657
2607
|
};
|
|
@@ -1769,11 +2719,117 @@ function extractJsonBlob(text) {
|
|
|
1769
2719
|
/**
 * Checks for a string that contains at least one non-whitespace character.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} `true` only for strings that are non-empty after trimming.
 */
function isNonEmptyString(value) {
  if (typeof value !== "string") {
    return false;
  }
  return value.trim() !== "";
}
|
|
2722
|
+
/**
 * Evaluator that scores a candidate answer by running an external script.
 *
 * The script receives a JSON document on stdin (see `evaluate`) and is
 * expected to print a JSON object with optional `score`, `hits`, `misses`
 * and `reasoning` fields.
 */
var CodeEvaluator = class {
  kind = "code";
  script;
  cwd;
  agentTimeoutMs;
  /**
   * @param {object} options
   * @param {string} options.script - Command handed to `executeScript`.
   * @param {string} [options.cwd] - Working directory for the script.
   * @param {number} [options.agentTimeoutMs] - Timeout forwarded to `executeScript`.
   */
  constructor(options) {
    const { script, cwd, agentTimeoutMs } = options;
    this.script = script;
    this.cwd = cwd;
    this.agentTimeoutMs = agentTimeoutMs;
  }
  /**
   * Runs the script for one eval case and converts its output into a score.
   *
   * @param {object} context - Evaluation context; reads `evalCase`,
   *   `candidate` and `promptInputs.systemMessage`.
   * @returns {Promise<object>} A score object. Any failure while running or
   *   interpreting the script yields score 0 with the error message recorded
   *   in `misses`, `reasoning` and `evaluatorRawRequest.error`.
   */
  async evaluate(context) {
    const { evalCase } = context;
    // Key order is part of the serialized payload the script receives.
    const payload = {
      task: evalCase.task,
      outcome: evalCase.outcome,
      expected: evalCase.expected_assistant_raw,
      output: context.candidate,
      system_message: context.promptInputs.systemMessage ?? "",
      guideline_paths: evalCase.guideline_paths,
      attachments: evalCase.file_paths,
      user_segments: evalCase.user_segments
    };
    const inputPayload = JSON.stringify(payload, null, 2);
    const rawRequest = {
      script: this.script,
      ...this.cwd ? { cwd: this.cwd } : {}
    };
    try {
      const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
      const parsed = parseJsonSafe(stdout);
      const pickStrings = (list) => Array.isArray(list) ? list.filter(isNonEmptyString) : [];
      // A missing or non-numeric score counts as 0 before clamping.
      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
      const hits = pickStrings(parsed?.hits);
      const misses = pickStrings(parsed?.misses);
      let reasoning;
      if (typeof parsed?.reasoning === "string") {
        reasoning = parsed.reasoning;
      }
      return {
        score,
        hits,
        misses,
        expectedAspectCount: hits.length + misses.length || 1,
        reasoning,
        evaluatorRawRequest: rawRequest
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        hits: [],
        misses: [`Code evaluator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: { ...rawRequest, error: message }
      };
    }
  }
};
|
|
2782
|
+
/**
 * Runs an evaluator script through the system shell, writes `input` to its
 * stdin, and returns what it printed to stdout.
 *
 * @param {string} scriptPath - Command line to run (`spawn` with `shell: true`).
 * @param {string} input - Text written to the script's stdin.
 * @param {number} [agentTimeoutMs] - When truthy, the child is killed and the
 *   promise rejects after this many milliseconds.
 * @param {string} [cwd] - Working directory for the script.
 * @returns {Promise<string>} Trimmed stdout. Rejects when the process cannot
 *   be started, when it times out, or when it exits with a non-zero code and
 *   wrote something to stderr.
 */
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
  const { spawn: spawn2 } = await import("child_process");
  return await new Promise((resolve, reject) => {
    const child = spawn2(scriptPath, {
      shell: true,
      cwd
    });
    let stdout = "";
    let stderr = "";
    const timeout = agentTimeoutMs ? setTimeout(() => {
      child.kill();
      reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
    }, agentTimeoutMs) : void 0;
    const clearTimer = () => {
      if (timeout !== void 0) {
        clearTimeout(timeout);
      }
    };
    // Decode at the stream level so a multi-byte character split across two
    // chunks is not corrupted by per-chunk Buffer#toString().
    child.stdout?.setEncoding("utf8");
    child.stdout?.on("data", (data) => {
      stdout += data;
    });
    child.stderr?.setEncoding("utf8");
    child.stderr?.on("data", (data) => {
      stderr += data;
    });
    child.on("error", (error) => {
      clearTimer();
      reject(error);
    });
    // 'close' (not 'exit') fires only after the stdio streams have ended, so
    // the captured stdout/stderr are complete when the promise settles.
    child.on("close", (code) => {
      clearTimer();
      if (code && code !== 0 && stderr.length > 0) {
        reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
        return;
      }
      resolve(stdout.trim());
    });
    // A script that exits without reading stdin makes the pipe emit EPIPE;
    // swallow it here so it cannot surface as an uncaught exception. The
    // outcome is still reported by the 'error'/'close' handlers above.
    child.stdin?.on("error", () => {
    });
    child.stdin?.end(input);
  });
}
|
|
2821
|
+
/**
 * Parses JSON without throwing.
 *
 * @param {string} payload - Text to parse.
 * @returns {unknown} The parsed value, or `undefined` when `payload` is not
 *   valid JSON.
 */
function parseJsonSafe(payload) {
  let value;
  try {
    value = JSON.parse(payload);
  } catch {
    value = void 0;
  }
  return value;
}
|
|
1772
2828
|
|
|
1773
2829
|
// src/evaluation/orchestrator.ts
|
|
1774
2830
|
var import_node_crypto2 = require("crypto");
|
|
1775
|
-
var
|
|
1776
|
-
var
|
|
2831
|
+
var import_promises6 = require("fs/promises");
|
|
2832
|
+
var import_node_path8 = __toESM(require("path"), 1);
|
|
1777
2833
|
|
|
1778
2834
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
1779
2835
|
var Node = class {
|
|
@@ -1920,7 +2976,7 @@ async function runEvaluation(options) {
|
|
|
1920
2976
|
targets,
|
|
1921
2977
|
env,
|
|
1922
2978
|
providerFactory,
|
|
1923
|
-
|
|
2979
|
+
evaluators,
|
|
1924
2980
|
maxRetries,
|
|
1925
2981
|
agentTimeoutMs,
|
|
1926
2982
|
promptDumpDir,
|
|
@@ -1979,8 +3035,14 @@ async function runEvaluation(options) {
|
|
|
1979
3035
|
}
|
|
1980
3036
|
return getOrCreateProvider(resolvedJudge);
|
|
1981
3037
|
};
|
|
1982
|
-
const
|
|
3038
|
+
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
1983
3039
|
const primaryProvider = getOrCreateProvider(target);
|
|
3040
|
+
const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
|
|
3041
|
+
if (target.providerBatching && !providerSupportsBatch && verbose) {
|
|
3042
|
+
console.warn(
|
|
3043
|
+
`Provider batching requested for target '${target.name}', but provider does not advertise batch support. Using per-case dispatch.`
|
|
3044
|
+
);
|
|
3045
|
+
}
|
|
1984
3046
|
if (onProgress && filteredEvalCases.length > 0) {
|
|
1985
3047
|
for (let i = 0; i < filteredEvalCases.length; i++) {
|
|
1986
3048
|
await onProgress({
|
|
@@ -1990,6 +3052,28 @@ async function runEvaluation(options) {
|
|
|
1990
3052
|
});
|
|
1991
3053
|
}
|
|
1992
3054
|
}
|
|
3055
|
+
if (providerSupportsBatch) {
|
|
3056
|
+
try {
|
|
3057
|
+
return await runBatchEvaluation({
|
|
3058
|
+
evalCases: filteredEvalCases,
|
|
3059
|
+
provider: primaryProvider,
|
|
3060
|
+
target,
|
|
3061
|
+
evaluatorRegistry,
|
|
3062
|
+
promptDumpDir,
|
|
3063
|
+
nowFn: now ?? (() => /* @__PURE__ */ new Date()),
|
|
3064
|
+
onProgress,
|
|
3065
|
+
onResult,
|
|
3066
|
+
verbose,
|
|
3067
|
+
resolveJudgeProvider,
|
|
3068
|
+
agentTimeoutMs
|
|
3069
|
+
});
|
|
3070
|
+
} catch (error) {
|
|
3071
|
+
if (verbose) {
|
|
3072
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3073
|
+
console.warn(`Provider batch execution failed, falling back to per-case dispatch: ${message}`);
|
|
3074
|
+
}
|
|
3075
|
+
}
|
|
3076
|
+
}
|
|
1993
3077
|
const workers = options.maxConcurrency ?? target.workers ?? 1;
|
|
1994
3078
|
const limit = pLimit(workers);
|
|
1995
3079
|
let nextWorkerId = 1;
|
|
@@ -2012,7 +3096,7 @@ async function runEvaluation(options) {
|
|
|
2012
3096
|
evalCase,
|
|
2013
3097
|
provider: primaryProvider,
|
|
2014
3098
|
target,
|
|
2015
|
-
|
|
3099
|
+
evaluators: evaluatorRegistry,
|
|
2016
3100
|
maxRetries,
|
|
2017
3101
|
agentTimeoutMs,
|
|
2018
3102
|
promptDumpDir,
|
|
@@ -2073,12 +3157,118 @@ async function runEvaluation(options) {
|
|
|
2073
3157
|
}
|
|
2074
3158
|
return results;
|
|
2075
3159
|
}
|
|
3160
|
+
/**
 * Evaluates all eval cases with a single provider batch call, then scores
 * each response one by one.
 *
 * @param {object} options
 * @param {object[]} options.evalCases - Cases to run, in order.
 * @param {object} options.provider - Provider whose `invokeBatch` is called once.
 * @param {object} options.target - Target definition; `target.name` labels error results.
 * @param {object} options.evaluatorRegistry - Passed to `evaluateCandidate` as `evaluators`.
 * @param {string} [options.promptDumpDir] - When set, each prompt is dumped via `dumpPrompt`.
 * @param {() => Date} options.nowFn - Clock used for result timestamps.
 * @param {Function} [options.onProgress] - Receives "running", "completed" and "failed" events.
 * @param {Function} [options.onResult] - Receives each result as it is produced.
 * @param {Function} options.resolveJudgeProvider - Called with `target` for every case.
 * @param {number} [options.agentTimeoutMs] - Forwarded to `evaluateCandidate`.
 * @returns {Promise<object[]>} One result per eval case, in input order.
 * @throws {Error} When `invokeBatch` does not return an array, or returns a
 *   different number of responses than there are eval cases.
 */
async function runBatchEvaluation(options) {
  const {
    evalCases,
    provider,
    target,
    evaluatorRegistry,
    promptDumpDir,
    nowFn,
    onProgress,
    onResult,
    resolveJudgeProvider,
    agentTimeoutMs
  } = options;
  // Prompts are built sequentially so dumps are written in case order.
  const promptInputsList = [];
  for (const evalCase of evalCases) {
    const promptInputs = await buildPromptInputs(evalCase);
    if (promptDumpDir) {
      await dumpPrompt(promptDumpDir, evalCase, promptInputs);
    }
    promptInputsList.push(promptInputs);
  }
  const batchRequests = evalCases.map((evalCase, index) => {
    const promptInputs = promptInputsList[index];
    return {
      prompt: promptInputs.request,
      guidelines: promptInputs.guidelines,
      guideline_patterns: evalCase.guideline_patterns,
      inputFiles: evalCase.file_paths,
      evalCaseId: evalCase.id,
      metadata: {
        systemPrompt: promptInputs.systemMessage ?? ""
      }
    };
  });
  const batchResponse = await provider.invokeBatch?.(batchRequests);
  if (!Array.isArray(batchResponse)) {
    throw new Error("Provider batching failed: invokeBatch did not return an array");
  }
  if (batchResponse.length !== evalCases.length) {
    throw new Error(
      `Provider batching failed: expected ${evalCases.length} responses, received ${batchResponse.length}`
    );
  }
  // One shared start time for the whole batch; reused in the "completed"
  // events so they report the same startedAt as the "running" events.
  const startedAt = Date.now();
  if (onProgress) {
    for (let i = 0; i < evalCases.length; i++) {
      await onProgress({
        workerId: 1,
        evalId: evalCases[i].id,
        status: "running",
        startedAt
      });
    }
  }
  const results = [];
  for (let i = 0; i < evalCases.length; i++) {
    const evalCase = evalCases[i];
    const promptInputs = promptInputsList[i];
    const providerResponse = batchResponse[i];
    let result;
    try {
      result = await evaluateCandidate({
        evalCase,
        candidate: providerResponse.text ?? "",
        target,
        provider,
        evaluators: evaluatorRegistry,
        promptInputs,
        nowFn,
        attempt: 0,
        // Resolved inside the try so a judge-resolution failure is reported
        // as an error result for this case rather than aborting the batch.
        judgeProvider: await resolveJudgeProvider(target),
        agentTimeoutMs
      });
    } catch (error) {
      const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
      results.push(errorResult);
      if (onResult) {
        await onResult(errorResult);
      }
      if (onProgress) {
        await onProgress({
          workerId: 1,
          evalId: evalCase.id,
          status: "failed",
          completedAt: Date.now(),
          error: error instanceof Error ? error.message : String(error)
        });
      }
      continue;
    }
    results.push(result);
    if (onResult) {
      await onResult(result);
    }
    if (onProgress) {
      await onProgress({
        workerId: 1,
        evalId: evalCase.id,
        status: "completed",
        startedAt,
        completedAt: Date.now()
      });
    }
  }
  return results;
}
|
|
2076
3266
|
async function runEvalCase(options) {
|
|
2077
3267
|
const {
|
|
2078
3268
|
evalCase,
|
|
2079
3269
|
provider,
|
|
2080
3270
|
target,
|
|
2081
|
-
|
|
3271
|
+
evaluators,
|
|
2082
3272
|
now,
|
|
2083
3273
|
maxRetries,
|
|
2084
3274
|
agentTimeoutMs,
|
|
@@ -2133,27 +3323,49 @@ async function runEvalCase(options) {
|
|
|
2133
3323
|
if (cacheKey && cache && !cachedResponse) {
|
|
2134
3324
|
await cache.set(cacheKey, providerResponse);
|
|
2135
3325
|
}
|
|
2136
|
-
const graderKind = evalCase.grader ?? "heuristic";
|
|
2137
|
-
const activeGrader = graders[graderKind] ?? graders.heuristic;
|
|
2138
|
-
if (!activeGrader) {
|
|
2139
|
-
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
2140
|
-
}
|
|
2141
|
-
let grade;
|
|
2142
3326
|
try {
|
|
2143
|
-
|
|
2144
|
-
grade = await activeGrader.grade({
|
|
3327
|
+
return await evaluateCandidate({
|
|
2145
3328
|
evalCase,
|
|
2146
3329
|
candidate: providerResponse.text ?? "",
|
|
2147
3330
|
target,
|
|
2148
3331
|
provider,
|
|
2149
|
-
|
|
3332
|
+
evaluators,
|
|
2150
3333
|
promptInputs,
|
|
2151
|
-
|
|
2152
|
-
|
|
3334
|
+
nowFn,
|
|
3335
|
+
attempt,
|
|
3336
|
+
judgeProvider,
|
|
3337
|
+
agentTimeoutMs
|
|
2153
3338
|
});
|
|
2154
3339
|
} catch (error) {
|
|
2155
3340
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
2156
3341
|
}
|
|
3342
|
+
}
|
|
3343
|
+
async function evaluateCandidate(options) {
|
|
3344
|
+
const {
|
|
3345
|
+
evalCase,
|
|
3346
|
+
candidate,
|
|
3347
|
+
target,
|
|
3348
|
+
provider,
|
|
3349
|
+
evaluators,
|
|
3350
|
+
promptInputs,
|
|
3351
|
+
nowFn,
|
|
3352
|
+
attempt,
|
|
3353
|
+
judgeProvider,
|
|
3354
|
+
agentTimeoutMs
|
|
3355
|
+
} = options;
|
|
3356
|
+
const gradeTimestamp = nowFn();
|
|
3357
|
+
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
3358
|
+
evalCase,
|
|
3359
|
+
candidate,
|
|
3360
|
+
target,
|
|
3361
|
+
provider,
|
|
3362
|
+
evaluators,
|
|
3363
|
+
attempt,
|
|
3364
|
+
promptInputs,
|
|
3365
|
+
now: gradeTimestamp,
|
|
3366
|
+
judgeProvider,
|
|
3367
|
+
agentTimeoutMs
|
|
3368
|
+
});
|
|
2157
3369
|
const completedAt = nowFn();
|
|
2158
3370
|
const rawRequest = {
|
|
2159
3371
|
request: promptInputs.request,
|
|
@@ -2164,28 +3376,200 @@ async function runEvalCase(options) {
|
|
|
2164
3376
|
return {
|
|
2165
3377
|
eval_id: evalCase.id,
|
|
2166
3378
|
conversation_id: evalCase.conversation_id,
|
|
2167
|
-
score:
|
|
2168
|
-
hits:
|
|
2169
|
-
misses:
|
|
2170
|
-
model_answer:
|
|
2171
|
-
expected_aspect_count:
|
|
3379
|
+
score: score.score,
|
|
3380
|
+
hits: score.hits,
|
|
3381
|
+
misses: score.misses,
|
|
3382
|
+
model_answer: candidate,
|
|
3383
|
+
expected_aspect_count: score.expectedAspectCount,
|
|
2172
3384
|
target: target.name,
|
|
2173
3385
|
timestamp: completedAt.toISOString(),
|
|
2174
|
-
reasoning:
|
|
2175
|
-
raw_aspects:
|
|
3386
|
+
reasoning: score.reasoning,
|
|
3387
|
+
raw_aspects: score.rawAspects,
|
|
2176
3388
|
raw_request: rawRequest,
|
|
2177
|
-
|
|
3389
|
+
evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
3390
|
+
evaluator_results: evaluatorResults
|
|
2178
3391
|
};
|
|
2179
3392
|
}
|
|
3393
|
+
/**
 * Scores one candidate answer using the evaluators configured for the case.
 *
 * When the eval case lists its own `evaluators`, the whole list is run via
 * `runEvaluatorList`. Otherwise a single evaluator is taken from the registry
 * by `evalCase.evaluator` (default "llm_judge"), falling back to the
 * registry's `llm_judge` entry when that kind is not registered.
 *
 * @param {object} options - Evaluation inputs; `options.evaluators` is the
 *   registry keyed by evaluator kind.
 * @returns {Promise<object>} `{ score }` for the single-evaluator path, or
 *   whatever `runEvaluatorList` returns for the list path.
 * @throws {Error} When no evaluator can be found in the registry.
 */
async function runEvaluatorsForCase(options) {
  const { evalCase, candidate, target, provider, evaluators, attempt, promptInputs, now, judgeProvider, agentTimeoutMs } = options;
  const configured = evalCase.evaluators;
  const hasConfiguredList = Boolean(configured) && configured.length > 0;
  if (!hasConfiguredList) {
    const evaluatorKind = evalCase.evaluator ?? "llm_judge";
    const activeEvaluator = evaluators[evaluatorKind] ?? evaluators.llm_judge;
    if (!activeEvaluator) {
      throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
    }
    const evaluationContext = {
      evalCase,
      candidate,
      target,
      provider,
      attempt,
      promptInputs,
      now,
      judgeProvider
    };
    const score = await activeEvaluator.evaluate(evaluationContext);
    return { score };
  }
  return runEvaluatorList({
    evalCase,
    evaluators: configured,
    candidate,
    target,
    provider,
    evaluatorRegistry: evaluators,
    attempt,
    promptInputs,
    now,
    judgeProvider,
    agentTimeoutMs
  });
}
|
|
3427
|
+
/**
 * Runs every evaluator configured on an eval case and combines the outcomes.
 *
 * Supported evaluator types are "llm_judge" (delegated to
 * `runLlmJudgeEvaluator`) and "code" (run through a fresh `CodeEvaluator`).
 * An evaluator that throws is recorded as a zero score with a miss that
 * describes the failure; it does not abort the remaining evaluators.
 *
 * @param {object} options
 * @param {object} options.evalCase - Case being scored.
 * @param {object[]} [options.evaluators] - Evaluator configs to run, in order.
 * @param {string} options.candidate - Candidate answer text.
 * @param {object} options.target
 * @param {object} options.provider
 * @param {object} options.evaluatorRegistry - Forwarded to `runLlmJudgeEvaluator`.
 * @param {number} options.attempt
 * @param {object} options.promptInputs
 * @param {Date} options.now
 * @param {object} [options.judgeProvider]
 * @param {number} [options.agentTimeoutMs] - Timeout for "code" evaluators.
 * @returns {Promise<{score: object, evaluatorResults: object[]}>} `score` is
 *   the aggregate (mean score, concatenated hits/misses, summed expected
 *   aspect count, reasoning joined with " | "); `evaluatorResults` holds one
 *   entry per evaluator that ran or failed.
 */
async function runEvaluatorList(options) {
  const {
    evalCase,
    evaluators,
    candidate,
    target,
    provider,
    evaluatorRegistry,
    attempt,
    promptInputs,
    now,
    judgeProvider,
    agentTimeoutMs
  } = options;
  const scored = [];
  const evaluatorResults = [];
  for (const evaluator of evaluators ?? []) {
    try {
      let outcome;
      if (evaluator.type === "llm_judge") {
        outcome = await runLlmJudgeEvaluator({
          config: evaluator,
          evalCase,
          candidate,
          target,
          provider,
          evaluatorRegistry,
          attempt,
          promptInputs,
          now,
          judgeProvider
        });
      } else if (evaluator.type === "code") {
        const codeEvaluator = new CodeEvaluator({
          script: evaluator.script,
          cwd: evaluator.resolvedCwd ?? evaluator.cwd,
          agentTimeoutMs
        });
        outcome = await codeEvaluator.evaluate({
          evalCase,
          candidate,
          target,
          provider,
          attempt,
          promptInputs,
          now
        });
      } else {
        // NOTE(review): any other type contributes neither a score nor a
        // result entry, exactly as before — confirm this is intended.
        continue;
      }
      // Build the result entry before recording anything, so a malformed
      // evaluator return value cannot be counted once as a success and once
      // more as a failure.
      const resultEntry = {
        name: evaluator.name,
        type: evaluator.type,
        score: outcome.score,
        hits: outcome.hits,
        misses: outcome.misses,
        reasoning: outcome.reasoning,
        evaluator_raw_request: outcome.evaluatorRawRequest
      };
      scored.push({ score: outcome, name: evaluator.name, type: evaluator.type });
      evaluatorResults.push(resultEntry);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      // Use the same fallback label everywhere so the aggregate misses and the
      // per-evaluator result describe the failure identically.
      const name = evaluator.name ?? "unknown";
      const type = evaluator.type ?? "unknown";
      const failure = `Evaluator '${name}' failed: ${message}`;
      scored.push({
        score: {
          score: 0,
          hits: [],
          misses: [failure],
          expectedAspectCount: 1,
          reasoning: message
        },
        name,
        type
      });
      evaluatorResults.push({
        name,
        type,
        score: 0,
        hits: [],
        misses: [failure],
        reasoning: message
      });
    }
  }
  const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
  const hits = scored.flatMap((entry) => entry.score.hits);
  const misses = scored.flatMap((entry) => entry.score.misses);
  const expectedAspectCount = scored.reduce((total, entry) => total + (entry.score.expectedAspectCount ?? 0), 0);
  const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
  const score = {
    score: aggregateScore,
    hits,
    misses,
    expectedAspectCount,
    reasoning,
    rawAspects: rawAspects.length > 0 ? rawAspects : void 0
  };
  return { score, evaluatorResults };
}
|
|
3534
|
+
/**
 * Runs the registry's `llm_judge` evaluator for one evaluator configuration.
 *
 * The custom prompt for `config` is resolved first (via `resolveCustomPrompt`),
 * then the evaluation context is forwarded to
 * `evaluatorRegistry.llm_judge.evaluate` together with three extra fields:
 * `systemPrompt` (the resolved prompt), `evaluator` (the config itself) and
 * `judgeModel` (`config.model`).
 *
 * @param {object} options - Bag holding `config`, `evalCase`, `candidate`,
 *   `target`, `provider`, `evaluatorRegistry`, `attempt`, `promptInputs`,
 *   `now` and `judgeProvider`.
 * @returns {Promise<*>} Whatever `evaluatorRegistry.llm_judge.evaluate` yields.
 */
async function runLlmJudgeEvaluator(options) {
  const {
    config,
    evalCase,
    candidate,
    target,
    provider,
    evaluatorRegistry,
    attempt,
    promptInputs,
    now,
    judgeProvider
  } = options;

  const systemPrompt = await resolveCustomPrompt(config);

  // Keep the judge as the receiver so `evaluate` still runs with its own `this`.
  const judge = evaluatorRegistry.llm_judge;
  const judgeRequest = {
    evalCase,
    candidate,
    target,
    provider,
    attempt,
    promptInputs,
    now,
    judgeProvider,
    systemPrompt,
    evaluator: config,
    judgeModel: config.model
  };
  return judge.evaluate(judgeRequest);
}
|
|
3551
|
+
/**
 * Resolves the prompt text for an evaluator configuration.
 *
 * When `config.promptPath` is truthy the file at that path is read as UTF-8
 * and its contents are returned. If the read fails, a warning is logged with
 * `console.warn` and the function falls back to `config.prompt`. Without a
 * `promptPath`, `config.prompt` is returned directly (it may be undefined).
 *
 * @param {object} config - Evaluator config; reads `promptPath` and `prompt`.
 * @returns {Promise<*>} File contents, or the value of `config.prompt`.
 */
async function resolveCustomPrompt(config) {
  if (!config.promptPath) {
    return config.prompt;
  }
  const { readFile } = import_promises6;
  try {
    const fileContents = await readFile(config.promptPath, "utf8");
    return fileContents;
  } catch (readError) {
    // Best-effort: an unreadable prompt file degrades to the inline prompt.
    const detail = readError instanceof Error ? readError.message : String(readError);
    console.warn(`Could not read custom prompt at ${config.promptPath}: ${detail}`);
  }
  return config.prompt;
}
|
|
3562
|
+
/**
 * Tells whether `value` is a string with at least one non-whitespace character.
 *
 * @param {*} value - Any value.
 * @returns {boolean} `true` only for strings whose trimmed form is non-empty.
 */
function isNonEmptyString2(value) {
  if (typeof value !== "string") {
    return false;
  }
  return value.trim() !== "";
}
|
|
2180
3565
|
/**
 * Narrows a list of eval cases to those whose `id` equals `evalId`.
 *
 * A falsy `evalId` means "no filter": the input array itself is returned,
 * not a copy.
 *
 * @param {Array<object>} evalCases - Eval cases, each read for its `id`.
 * @param {*} evalId - Id to keep; falsy to keep everything.
 * @returns {Array<object>} The original array, or a new filtered array.
 */
function filterEvalCases(evalCases, evalId) {
  const matchesRequestedId = (candidateCase) => candidateCase.id === evalId;
  return evalId ? evalCases.filter(matchesRequestedId) : evalCases;
}
|
|
2186
|
-
function
|
|
2187
|
-
const
|
|
2188
|
-
const llmJudge = overrides?.llm_judge ?? new QualityGrader({
|
|
3571
|
+
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
3572
|
+
const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
|
|
2189
3573
|
resolveJudgeProvider: async (context) => {
|
|
2190
3574
|
if (context.judgeProvider) {
|
|
2191
3575
|
return context.judgeProvider;
|
|
@@ -2195,22 +3579,21 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
|
|
|
2195
3579
|
});
|
|
2196
3580
|
return {
|
|
2197
3581
|
...overrides,
|
|
2198
|
-
heuristic,
|
|
2199
3582
|
llm_judge: llmJudge
|
|
2200
3583
|
};
|
|
2201
3584
|
}
|
|
2202
3585
|
/**
 * Writes a JSON snapshot of an eval case's prompt inputs into `directory`.
 *
 * The file is named `<timestamp>_<sanitized eval id>.json`, where the
 * timestamp is the current ISO time with `:` and `.` replaced by `-`. The
 * containing directory is created recursively before writing. The payload
 * holds `eval_id`, `request`, `guidelines` and `guideline_paths`, serialized
 * with two-space indentation as UTF-8.
 *
 * @param {string} directory - Directory the dump is resolved against.
 * @param {object} evalCase - Read for `id` and `guideline_paths`.
 * @param {object} promptInputs - Read for `request` and `guidelines`.
 * @returns {Promise<void>} Settles once the file has been written.
 */
async function dumpPrompt(directory, evalCase, promptInputs) {
  const { mkdir, writeFile } = import_promises6;
  const pathLib = import_node_path8.default;

  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const targetPath = pathLib.resolve(directory, `${stamp}_${sanitizeFilename(evalCase.id)}.json`);
  await mkdir(pathLib.dirname(targetPath), { recursive: true });

  const snapshot = {
    eval_id: evalCase.id,
    request: promptInputs.request,
    guidelines: promptInputs.guidelines,
    guideline_paths: evalCase.guideline_paths
  };
  const serialized = JSON.stringify(snapshot, null, 2);
  await writeFile(targetPath, serialized, "utf8");
}
|
|
2215
3598
|
function sanitizeFilename(value) {
|
|
2216
3599
|
if (!value) {
|
|
@@ -2220,7 +3603,7 @@ function sanitizeFilename(value) {
|
|
|
2220
3603
|
return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
|
|
2221
3604
|
}
|
|
2222
3605
|
async function invokeProvider(provider, options) {
|
|
2223
|
-
const { evalCase,
|
|
3606
|
+
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
2224
3607
|
const controller = new AbortController();
|
|
2225
3608
|
const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
|
|
2226
3609
|
if (signal) {
|
|
@@ -2231,7 +3614,7 @@ async function invokeProvider(provider, options) {
|
|
|
2231
3614
|
prompt: promptInputs.request,
|
|
2232
3615
|
guidelines: promptInputs.guidelines,
|
|
2233
3616
|
guideline_patterns: evalCase.guideline_patterns,
|
|
2234
|
-
|
|
3617
|
+
inputFiles: evalCase.file_paths,
|
|
2235
3618
|
evalCaseId: evalCase.id,
|
|
2236
3619
|
attempt,
|
|
2237
3620
|
metadata: {
|
|
@@ -2300,25 +3683,20 @@ function createAgentKernel() {
|
|
|
2300
3683
|
}
|
|
2301
3684
|
// Annotate the CommonJS export names for ESM import in node:
|
|
2302
3685
|
0 && (module.exports = {
|
|
2303
|
-
|
|
2304
|
-
|
|
2305
|
-
QualityGrader,
|
|
3686
|
+
CodeEvaluator,
|
|
3687
|
+
LlmJudgeEvaluator,
|
|
2306
3688
|
TEST_MESSAGE_ROLES,
|
|
2307
3689
|
buildDirectoryChain,
|
|
2308
3690
|
buildPromptInputs,
|
|
2309
3691
|
buildSearchRoots,
|
|
2310
|
-
calculateHits,
|
|
2311
|
-
calculateMisses,
|
|
2312
3692
|
createAgentKernel,
|
|
2313
3693
|
createProvider,
|
|
2314
3694
|
ensureVSCodeSubagents,
|
|
2315
|
-
extractAspects,
|
|
2316
3695
|
extractCodeBlocks,
|
|
2317
3696
|
fileExists,
|
|
2318
3697
|
findGitRoot,
|
|
2319
3698
|
getHitCount,
|
|
2320
|
-
|
|
2321
|
-
isGraderKind,
|
|
3699
|
+
isEvaluatorKind,
|
|
2322
3700
|
isGuidelineFile,
|
|
2323
3701
|
isJsonObject,
|
|
2324
3702
|
isJsonValue,
|
|
@@ -2331,7 +3709,6 @@ function createAgentKernel() {
|
|
|
2331
3709
|
resolveFileReference,
|
|
2332
3710
|
resolveTargetDefinition,
|
|
2333
3711
|
runEvalCase,
|
|
2334
|
-
runEvaluation
|
|
2335
|
-
scoreCandidateResponse
|
|
3712
|
+
runEvaluation
|
|
2336
3713
|
});
|
|
2337
3714
|
//# sourceMappingURL=index.cjs.map
|