@agentv/core 0.2.11 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-P4GOYWYH.js → chunk-NL7K4CAK.js} +5 -1
- package/dist/chunk-NL7K4CAK.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +186 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +183 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1482 -359
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +107 -63
- package/dist/index.d.ts +107 -63
- package/dist/index.js +1474 -350
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-P4GOYWYH.js.map +0 -1
- package/dist/chunk-XXNQA4EW.js +0 -140
- package/dist/chunk-XXNQA4EW.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -30,25 +30,20 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
30
30
|
// src/index.ts
|
|
31
31
|
var index_exports = {};
|
|
32
32
|
__export(index_exports, {
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
QualityGrader: () => QualityGrader,
|
|
33
|
+
CodeEvaluator: () => CodeEvaluator,
|
|
34
|
+
LlmJudgeEvaluator: () => LlmJudgeEvaluator,
|
|
36
35
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
37
36
|
buildDirectoryChain: () => buildDirectoryChain,
|
|
38
37
|
buildPromptInputs: () => buildPromptInputs,
|
|
39
38
|
buildSearchRoots: () => buildSearchRoots,
|
|
40
|
-
calculateHits: () => calculateHits,
|
|
41
|
-
calculateMisses: () => calculateMisses,
|
|
42
39
|
createAgentKernel: () => createAgentKernel,
|
|
43
40
|
createProvider: () => createProvider,
|
|
44
41
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
45
|
-
extractAspects: () => extractAspects,
|
|
46
42
|
extractCodeBlocks: () => extractCodeBlocks,
|
|
47
43
|
fileExists: () => fileExists,
|
|
48
44
|
findGitRoot: () => findGitRoot,
|
|
49
45
|
getHitCount: () => getHitCount,
|
|
50
|
-
|
|
51
|
-
isGraderKind: () => isGraderKind,
|
|
46
|
+
isEvaluatorKind: () => isEvaluatorKind,
|
|
52
47
|
isGuidelineFile: () => isGuidelineFile,
|
|
53
48
|
isJsonObject: () => isJsonObject,
|
|
54
49
|
isJsonValue: () => isJsonValue,
|
|
@@ -61,8 +56,7 @@ __export(index_exports, {
|
|
|
61
56
|
resolveFileReference: () => resolveFileReference,
|
|
62
57
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
63
58
|
runEvalCase: () => runEvalCase,
|
|
64
|
-
runEvaluation: () => runEvaluation
|
|
65
|
-
scoreCandidateResponse: () => scoreCandidateResponse
|
|
59
|
+
runEvaluation: () => runEvaluation
|
|
66
60
|
});
|
|
67
61
|
module.exports = __toCommonJS(index_exports);
|
|
68
62
|
|
|
@@ -107,11 +101,10 @@ function isTestMessage(value) {
|
|
|
107
101
|
}
|
|
108
102
|
return candidate.content.every(isJsonObject);
|
|
109
103
|
}
|
|
110
|
-
var
|
|
111
|
-
var
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
return typeof value === "string" && GRADER_KIND_SET.has(value);
|
|
104
|
+
// Evaluator kinds that an eval file may name.
var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
// Set form of the list above, used for membership checks.
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
/**
 * Type guard for evaluator kind names.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True only for a string listed in EVALUATOR_KIND_VALUES.
 */
function isEvaluatorKind(value) {
  if (typeof value !== "string") {
    return false;
  }
  return EVALUATOR_KIND_SET.has(value);
}
|
|
116
109
|
function getHitCount(result) {
|
|
117
110
|
return result.hits.length;
|
|
@@ -325,7 +318,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
325
318
|
if (!Array.isArray(rawTestcases)) {
|
|
326
319
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
327
320
|
}
|
|
328
|
-
const
|
|
321
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
329
322
|
const results = [];
|
|
330
323
|
for (const rawEvalcase of rawTestcases) {
|
|
331
324
|
if (!isJsonObject(rawEvalcase)) {
|
|
@@ -448,7 +441,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
448
441
|
const assistantContent = assistantMessages[0]?.content;
|
|
449
442
|
const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
450
443
|
const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
451
|
-
const
|
|
444
|
+
const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
445
|
+
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
452
446
|
const userFilePaths = [];
|
|
453
447
|
for (const segment of userSegments) {
|
|
454
448
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -471,7 +465,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
471
465
|
file_paths: allFilePaths,
|
|
472
466
|
code_snippets: codeSnippets,
|
|
473
467
|
outcome,
|
|
474
|
-
|
|
468
|
+
evaluator: testCaseEvaluatorKind,
|
|
469
|
+
evaluators
|
|
475
470
|
};
|
|
476
471
|
if (verbose) {
|
|
477
472
|
console.log(`
|
|
@@ -632,14 +627,88 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
|
632
627
|
}
|
|
633
628
|
return parts.join(" ");
|
|
634
629
|
}
|
|
635
|
-
function
|
|
630
|
+
/**
 * Parses the evaluator definitions declared on a raw eval case.
 *
 * Definitions are taken from `execution.evaluators` when `execution` is a JSON
 * object that defines them, otherwise from the case-level `evaluators` field.
 * Malformed entries are skipped with a warning instead of failing the case.
 *
 * @param rawEvalCase - Raw eval case object.
 * @param searchRoots - Roots handed to `resolveFileReference` for `cwd` and `prompt` lookups.
 * @param evalId - Identifier used in warning messages.
 * @returns A non-empty array of evaluator configs, or undefined when none are
 *   declared, the field is not an array, or no entry is valid.
 */
async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
  const { execution } = rawEvalCase;
  const caseLevel = rawEvalCase.evaluators;
  const declared = isJsonObject(execution) ? execution.evaluators ?? caseLevel : caseLevel;
  if (declared === void 0) {
    return void 0;
  }
  if (!Array.isArray(declared)) {
    logWarning(`Skipping evaluators for '${evalId}': expected array`);
    return void 0;
  }
  // Detail lines for a failed file lookup; undefined when nothing was attempted.
  const listAttempts = (resolved) => resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0;
  const parsed = [];
  for (const entry of declared) {
    if (!isJsonObject(entry)) {
      logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
      continue;
    }
    const name = asString(entry.name);
    const kind = entry.type;
    if (!name || !isEvaluatorKind(kind)) {
      logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
      continue;
    }
    if (kind === "code") {
      const script = asString(entry.script);
      if (!script) {
        logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
        continue;
      }
      const cwd = asString(entry.cwd);
      let resolvedCwd;
      if (cwd) {
        const resolved = await resolveFileReference(cwd, searchRoots);
        if (resolved.resolvedPath) {
          resolvedCwd = import_node_path2.default.resolve(resolved.resolvedPath);
        } else {
          // The evaluator is still registered; only resolvedCwd stays undefined.
          logWarning(
            `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
            listAttempts(resolved)
          );
        }
      }
      parsed.push({ name, type: "code", script, cwd, resolvedCwd });
    } else {
      // Remaining kind is "llm_judge": `prompt` may name a file or be inline text.
      const prompt = asString(entry.prompt);
      let promptPath;
      if (prompt) {
        const resolved = await resolveFileReference(prompt, searchRoots);
        if (resolved.resolvedPath) {
          promptPath = import_node_path2.default.resolve(resolved.resolvedPath);
        } else {
          logWarning(
            `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
            listAttempts(resolved)
          );
        }
      }
      const model = asString(entry.model);
      parsed.push({ name, type: "llm_judge", prompt, promptPath, model });
    }
  }
  return parsed.length > 0 ? parsed : void 0;
}
|
|
704
|
+
/**
 * Validates an evaluator kind read from an eval file.
 *
 * @param candidate - Raw value from the file.
 * @param contextId - Label used in the warning for an unknown kind.
 * @returns The candidate when it is a known evaluator kind; otherwise undefined.
 *   Non-string input is ignored silently, unknown strings are warned about.
 */
function coerceEvaluator(candidate, contextId) {
  if (typeof candidate === "string") {
    if (isEvaluatorKind(candidate)) {
      return candidate;
    }
    logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
  }
  return void 0;
}
|
|
645
714
|
function logWarning(message, details) {
|
|
@@ -835,6 +904,790 @@ var GeminiProvider = class {
|
|
|
835
904
|
}
|
|
836
905
|
};
|
|
837
906
|
|
|
907
|
+
// src/evaluation/providers/cli.ts
|
|
908
|
+
var import_node_child_process = require("child_process");
|
|
909
|
+
var import_node_path3 = __toESM(require("path"), 1);
|
|
910
|
+
var import_node_util = require("util");
|
|
911
|
+
var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
|
|
912
|
+
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
|
|
913
|
+
/**
 * Default command runner for the CLI provider: runs `command` through the
 * promisified `exec` and reports the outcome as a result object, never throwing.
 *
 * @param command - Fully rendered command line.
 * @param options - `{ cwd, env, timeoutMs, signal }` forwarded to `exec`.
 * @returns `{ stdout, stderr, exitCode, failed, timedOut, signal }`. On failure
 *   `exitCode` is null unless the error carries a numeric `code`.
 */
async function defaultCommandRunner(command, options) {
  const { cwd, env, timeoutMs, signal } = options;
  // On Windows the command is run through PowerShell; elsewhere exec's default shell is used.
  const shell = process.platform === "win32" ? "powershell.exe" : void 0;
  let failure;
  try {
    const { stdout, stderr } = await execAsync(command, {
      cwd,
      env,
      timeout: timeoutMs,
      signal,
      maxBuffer: DEFAULT_MAX_BUFFER,
      shell
    });
    return { stdout, stderr, exitCode: 0, failed: false, timedOut: false, signal: null };
  } catch (error) {
    failure = error;
  }
  const hasNumericCode = typeof failure.code === "number";
  return {
    stdout: failure.stdout ?? "",
    stderr: failure.stderr ?? "",
    exitCode: hasNumericCode ? failure.code : null,
    failed: true,
    // A killed child is reported as a timeout.
    timedOut: failure.timedOut === true || failure.killed === true,
    signal: failure.signal ?? null
  };
}
|
|
944
|
+
/**
 * Provider that answers a request by rendering a command template and running
 * it through an injectable command runner.
 *
 * An optional healthcheck (HTTP GET or command) runs before the first
 * invocation. Its successful result is cached for the provider's lifetime.
 * A failed healthcheck is NOT cached: otherwise a single transient failure, or
 * one caller aborting its own request, would make every later `invoke` reject.
 */
var CliProvider = class {
  id;
  kind = "cli";
  targetName;
  supportsBatch = false;
  config;
  runCommand;
  healthcheckPromise;
  /**
   * @param targetName - Target name; also used to build `id` (`cli:<targetName>`).
   * @param config - Provider settings (`commandTemplate`, `cwd`, `env`, `timeoutMs`, `filesFormat`, `healthcheck`).
   * @param runner - Command runner; defaults to `defaultCommandRunner`.
   */
  constructor(targetName, config, runner = defaultCommandRunner) {
    this.targetName = targetName;
    this.id = `cli:${targetName}`;
    this.config = config;
    this.runCommand = runner;
  }
  /**
   * Runs the configured command for one request.
   *
   * @param request - Provider request (`prompt`, `guidelines`, `inputFiles`, `evalCaseId`, `attempt`, `signal`).
   * @returns `{ text, raw }` where `text` is the command's stdout.
   * @throws Error when the request is aborted, the healthcheck fails, the
   *   command times out, or it fails / exits non-zero.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("CLI provider request was aborted before execution");
    }
    await this.ensureHealthy(request.signal);
    const templateValues = buildTemplateValues(request, this.config);
    const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
    const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
    const result = await this.runCommand(renderedCommand, {
      cwd: this.config.cwd,
      env,
      timeoutMs: this.config.timeoutMs,
      signal: request.signal
    });
    if (result.failed || (result.exitCode ?? 0) !== 0) {
      // Abort takes precedence over timeout, which takes precedence over a plain failure.
      if (request.signal?.aborted) {
        throw new Error("CLI provider request was aborted");
      }
      if (result.timedOut) {
        throw new Error(
          `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
        );
      }
      const codeText = result.exitCode !== null ? result.exitCode : "unknown";
      const detail = result.stderr.trim() || result.stdout.trim();
      const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
      throw new Error(message);
    }
    return {
      text: result.stdout,
      raw: {
        command: renderedCommand,
        stderr: result.stderr,
        exitCode: result.exitCode ?? 0,
        cwd: this.config.cwd
      }
    };
  }
  /**
   * Runs the healthcheck at most once at a time and shares its promise between
   * callers. The cache is dropped when the check rejects so the next call retries.
   *
   * @param signal - Abort signal of the request that triggers the check.
   */
  async ensureHealthy(signal) {
    if (!this.config.healthcheck) {
      return;
    }
    if (!this.healthcheckPromise) {
      const pending = this.runHealthcheck(this.config.healthcheck, signal);
      this.healthcheckPromise = pending;
      // Side branch only clears the cache; the rejection itself still reaches the awaiting caller.
      pending.catch(() => {
        if (this.healthcheckPromise === pending) {
          this.healthcheckPromise = void 0;
        }
      });
    }
    return this.healthcheckPromise;
  }
  /**
   * Executes one healthcheck.
   *
   * @param healthcheck - `{ type: "http", url }` or a command check with `commandTemplate` and optional `cwd`; both accept `timeoutMs`.
   * @param signal - Abort signal forwarded to the fetch or the command runner.
   * @throws Error prefixed with `CLI healthcheck failed for '<target>'` on any failure.
   */
  async runHealthcheck(healthcheck, signal) {
    if (!healthcheck) {
      return;
    }
    const timeoutMs = healthcheck.timeoutMs ?? this.config.timeoutMs;
    if (healthcheck.type === "http") {
      const controller = new AbortController();
      const timer = timeoutMs ? setTimeout(() => controller.abort(), timeoutMs) : void 0;
      const onAbort = () => controller.abort();
      if (signal?.aborted) {
        // An "abort" event never fires for a signal that is already aborted.
        controller.abort();
      } else {
        signal?.addEventListener("abort", onAbort, { once: true });
      }
      try {
        const response = await fetch(healthcheck.url, { method: "GET", signal: controller.signal });
        if (!response.ok) {
          throw new Error(`HTTP ${response.status} ${response.statusText}`);
        }
      } catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        throw new Error(`CLI healthcheck failed for '${this.targetName}': ${reason}`);
      } finally {
        if (timer !== void 0) {
          clearTimeout(timer);
        }
        // Do not leave a listener on a signal owned by the caller.
        signal?.removeEventListener("abort", onAbort);
      }
      return;
    }
    // Command healthcheck: rendered with empty request values.
    const renderedCommand = renderTemplate(
      healthcheck.commandTemplate,
      buildTemplateValues(
        {
          prompt: "",
          guidelines: "",
          inputFiles: [],
          evalCaseId: "",
          attempt: 0
        },
        this.config
      )
    );
    const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
    const result = await this.runCommand(renderedCommand, {
      cwd: healthcheck.cwd ?? this.config.cwd,
      env,
      timeoutMs,
      signal
    });
    if (result.failed || (result.exitCode ?? 0) !== 0) {
      const codeText = result.exitCode !== null ? result.exitCode : "unknown";
      const detail = result.stderr.trim() || result.stdout.trim();
      const message = detail ? `${detail} (exit code ${codeText})` : `CLI healthcheck command exited with code ${codeText}`;
      throw new Error(`CLI healthcheck failed for '${this.targetName}': ${message}`);
    }
  }
};
|
|
1057
|
+
/**
 * Builds the placeholder values for a CLI command template.
 *
 * @param request - Provider request supplying prompt, guidelines, eval id, attempt and input files.
 * @param config - Provider config; `filesFormat` controls how each file is rendered.
 * @returns Map with keys PROMPT, GUIDELINES, EVAL_ID, ATTEMPT (each shell-escaped,
 *   missing values treated as "" / 0) and FILES (the formatted file list).
 */
function buildTemplateValues(request, config) {
  const files = normalizeInputFiles(request.inputFiles);
  const values = {};
  values.PROMPT = shellEscape(request.prompt ?? "");
  values.GUIDELINES = shellEscape(request.guidelines ?? "");
  values.EVAL_ID = shellEscape(request.evalCaseId ?? "");
  values.ATTEMPT = shellEscape(String(request.attempt ?? 0));
  values.FILES = formatFileList(files, config.filesFormat);
  return values;
}
|
|
1067
|
+
/**
 * Resolves input file paths to absolute form and removes duplicates, keeping
 * first-seen order.
 *
 * @param inputFiles - Optional list of file paths.
 * @returns Array of unique absolute paths, or undefined when the list is missing or empty.
 */
function normalizeInputFiles(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return void 0;
  }
  const seen = /* @__PURE__ */ new Set();
  for (const inputFile of inputFiles) {
    seen.add(import_node_path3.default.resolve(inputFile));
  }
  return [...seen];
}
|
|
1080
|
+
/**
 * Renders input files as a space-separated list for the `{FILES}` placeholder.
 *
 * Each file is rendered through `template`, where `{path}` becomes the
 * shell-escaped file path and `{basename}` the shell-escaped file name.
 *
 * Both tokens are substituted in a single pass with a replacer function, so
 * that (a) `$&`, `$'` and similar sequences in a path are inserted literally
 * instead of being interpreted as replacement patterns, and (b) a path that
 * itself contains the text `{basename}` is not substituted a second time.
 *
 * @param files - Optional list of file paths.
 * @param template - Per-file format; defaults to "{path}".
 * @returns The joined list, or "" when there are no files.
 */
function formatFileList(files, template) {
  if (!files || files.length === 0) {
    return "";
  }
  const formatter = template ?? "{path}";
  return files.map((filePath) => {
    const substitutions = {
      "{path}": shellEscape(filePath),
      "{basename}": shellEscape(import_node_path3.default.basename(filePath))
    };
    return formatter.replace(/\{path\}|\{basename\}/g, (token) => substitutions[token]);
  }).join(" ");
}
|
|
1091
|
+
/**
 * Substitutes `{NAME}` placeholders (upper-case letters and underscores) in a
 * command template.
 *
 * @param template - Template text.
 * @param values - Placeholder values keyed by name.
 * @returns The rendered text; placeholders without a defined value are left untouched.
 */
function renderTemplate(template, values) {
  const placeholderPattern = /\{([A-Z_]+)\}/g;
  return template.replace(placeholderPattern, (placeholder, name) => {
    const value = values[name];
    if (value === void 0) {
      return placeholder;
    }
    return value;
  });
}
|
|
1097
|
+
/**
 * Quotes a value so the shell used by the command runner treats it as one
 * literal argument.
 *
 * - POSIX shells: single-quoted, with each embedded `'` written as `'"'"'`.
 * - Windows: commands are run through PowerShell, so the value is written as a
 *   PowerShell single-quoted (verbatim) string with each embedded `'` doubled.
 *   Double quotes are not used there because PowerShell expands `$name` and
 *   `$(...)` inside them and does not treat `\"` as an escaped quote.
 *
 * @param {string} value - Raw argument text.
 * @returns {string} The quoted argument; `''` for the empty string.
 */
function shellEscape(value) {
  if (value.length === 0) {
    return "''";
  }
  if (process.platform === "win32") {
    return `'${value.replace(/'/g, "''")}'`;
  }
  return `'${value.replace(/'/g, `'"'"'`)}'`;
}
|
|
1107
|
+
/**
 * Formats the " after Ns" suffix used in timeout error messages.
 *
 * @param timeoutMs - Timeout in milliseconds, if any.
 * @returns ` after <seconds>s` with seconds rounded up, or "" when the timeout
 *   is missing, zero or negative.
 */
function formatTimeoutSuffix(timeoutMs) {
  if (timeoutMs && !(timeoutMs <= 0)) {
    const seconds = Math.ceil(timeoutMs / 1e3);
    return ` after ${seconds}s`;
  }
  return "";
}
|
|
1114
|
+
|
|
1115
|
+
// src/evaluation/providers/codex.ts
|
|
1116
|
+
var import_node_child_process2 = require("child_process");
|
|
1117
|
+
var import_node_fs3 = require("fs");
|
|
1118
|
+
var import_promises3 = require("fs/promises");
|
|
1119
|
+
var import_node_os = require("os");
|
|
1120
|
+
var import_node_path5 = __toESM(require("path"), 1);
|
|
1121
|
+
var import_node_util2 = require("util");
|
|
1122
|
+
|
|
1123
|
+
// src/evaluation/providers/preread.ts
|
|
1124
|
+
var import_node_path4 = __toESM(require("path"), 1);
|
|
1125
|
+
/**
 * Assembles the prompt document: an optional mandatory pre-read block listing
 * guideline and input files, followed by the user query section.
 *
 * @param request - Provider request; supplies `prompt` and, as a fallback, `guideline_patterns`.
 * @param inputFiles - Files to list in the pre-read block.
 * @param options - Optional `{ guidelinePatterns, guidelineOverrides }`.
 * @returns The trimmed prompt text.
 */
function buildPromptDocument(request, inputFiles, options) {
  const patterns = options?.guidelinePatterns ?? request.guideline_patterns;
  const guidelineFiles = collectGuidelineFiles(inputFiles, patterns, options?.guidelineOverrides);
  // Files already listed as guidelines are not repeated under input files.
  const remainingInputFiles = collectInputFiles(inputFiles).filter(
    (file) => !guidelineFiles.includes(file)
  );
  const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, remainingInputFiles);
  const parts = prereadBlock.length > 0 ? ["\n", prereadBlock] : [];
  parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
  return parts.join("\n").trim();
}
|
|
1143
|
+
/**
 * Resolves input file paths to absolute form and drops duplicates while
 * preserving first-seen order.
 *
 * @param inputFiles - Optional list of file paths.
 * @returns Array of unique absolute paths, or undefined when the list is missing or empty.
 */
function normalizeInputFiles2(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return void 0;
  }
  const uniquePaths = /* @__PURE__ */ new Set();
  for (const inputFile of inputFiles) {
    uniquePaths.add(import_node_path4.default.resolve(inputFile));
  }
  return [...uniquePaths];
}
|
|
1156
|
+
/**
 * Selects the input files that count as guideline files.
 *
 * A file qualifies when its absolute path is in `overrides`, or when
 * `isGuidelineFile` accepts its forward-slash-normalised absolute path.
 *
 * @param inputFiles - Optional list of file paths.
 * @param guidelinePatterns - Patterns forwarded to `isGuidelineFile`.
 * @param overrides - Optional set of absolute paths that are always guidelines.
 * @returns Unique absolute paths in first-seen order; [] when there are no input files.
 */
function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  const selected = /* @__PURE__ */ new Set();
  for (const inputFile of inputFiles) {
    const absolutePath = import_node_path4.default.resolve(inputFile);
    // Overrides win without consulting the patterns.
    const matches = overrides?.has(absolutePath) || isGuidelineFile(absolutePath.split(import_node_path4.default.sep).join("/"), guidelinePatterns);
    if (matches) {
      selected.add(absolutePath);
    }
  }
  return [...selected];
}
|
|
1178
|
+
/**
 * Returns the unique absolute paths of the given input files in first-seen order.
 *
 * @param inputFiles - Optional list of file paths.
 * @returns Array of absolute paths; [] when the list is missing or empty.
 */
function collectInputFiles(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  const absolutePaths = inputFiles.map((inputFile) => import_node_path4.default.resolve(inputFile));
  return [...new Set(absolutePaths)];
}
|
|
1191
|
+
/**
 * Builds the instruction block telling the agent which files to read before
 * answering the user query.
 *
 * @param guidelineFiles - Absolute paths of guideline files.
 * @param inputFiles - Absolute paths of the remaining input files.
 * @returns Newline-joined instructions with one markdown link per file, or ""
 *   when both lists are empty.
 */
function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
  const hasGuidelines = guidelineFiles.length > 0;
  const hasInputs = inputFiles.length > 0;
  if (!hasGuidelines && !hasInputs) {
    return "";
  }
  // One "* [name](file-uri)" bullet per file, joined by newlines.
  const toBulletList = (files) => {
    const bullets = [];
    for (const absolutePath of files) {
      const fileName = import_node_path4.default.basename(absolutePath);
      bullets.push(`* [${fileName}](${pathToFileUri(absolutePath)})`);
    }
    return bullets.join("\n");
  };
  const sections = [];
  if (hasGuidelines) {
    sections.push(`Read all guideline files:
${toBulletList(guidelineFiles)}.`);
  }
  if (hasInputs) {
    sections.push(`Read all input files:
${toBulletList(inputFiles)}.`);
  }
  sections.push(
    "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
    "Then apply system_instructions on the user query below."
  );
  return sections.join("\n");
}
|
|
1215
|
+
/**
 * Converts a file path to a `file://` URI string.
 *
 * Relative paths are resolved first and backslashes become forward slashes.
 * Paths starting with a drive letter (`C:/...`) get a third slash after the scheme.
 * The path is not percent-encoded.
 *
 * @param {string} filePath - Absolute or relative file path.
 * @returns {string} The `file://` URI.
 */
function pathToFileUri(filePath) {
  let absolutePath = filePath;
  if (!import_node_path4.default.isAbsolute(filePath)) {
    absolutePath = import_node_path4.default.resolve(filePath);
  }
  const forwardSlashed = absolutePath.replace(/\\/g, "/");
  const startsWithDrive = /^[a-zA-Z]:\//.test(forwardSlashed);
  const prefix = startsWithDrive ? "file:///" : "file://";
  return `${prefix}${forwardSlashed}`;
}
|
|
1223
|
+
|
|
1224
|
+
// src/evaluation/providers/codex.ts
|
|
1225
|
+
var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
|
|
1226
|
+
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
1227
|
+
var PROMPT_FILENAME = "prompt.md";
|
|
1228
|
+
var FILES_DIR = "files";
|
|
1229
|
+
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
1230
|
+
/**
 * Provider that answers a request by running the Codex CLI in `exec --json`
 * mode inside a temporary workspace.
 *
 * For each request the input files are copied ("mirrored") into the workspace,
 * a prompt document referencing the mirrored files is written to disk and also
 * passed to the runner, and the CLI's JSON output is parsed into the response.
 * The workspace is removed afterwards.
 */
var CodexProvider = class {
  id;
  kind = "codex";
  targetName;
  supportsBatch = false;
  config;
  runCodex;
  environmentCheck;
  resolvedExecutable;
  /**
   * @param targetName - Target name; also used to build `id` (`codex:<targetName>`).
   * @param config - Provider settings (`executable`, `args`, `cwd`, `timeoutMs`).
   * @param runner - Function that actually runs the CLI; defaults to `defaultCodexRunner`.
   */
  constructor(targetName, config, runner = defaultCodexRunner) {
    this.id = `codex:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runCodex = runner;
  }
  /**
   * Runs one request through the Codex CLI.
   *
   * @param request - Provider request (`prompt`, `inputFiles`, `guideline_patterns`, `signal`).
   * @returns `{ text, raw }` with the extracted assistant text and run details.
   * @throws Error when the request is already aborted, the executable cannot be
   *   located, the CLI times out, exits non-zero, or emits unparsable output.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Codex provider request was aborted before execution");
    }
    await this.ensureEnvironmentReady();
    const inputFiles = normalizeInputFiles2(request.inputFiles);
    // Remember which ORIGINAL files are guidelines so their mirrors can be flagged.
    const originalGuidelines = new Set(
      collectGuidelineFiles(inputFiles, request.guideline_patterns).map((file) => import_node_path5.default.resolve(file))
    );
    const workspaceRoot = await this.createWorkspace();
    try {
      const { mirroredInputFiles, guidelineMirrors } = await this.mirrorInputFiles(
        inputFiles,
        workspaceRoot,
        originalGuidelines
      );
      const promptContent = buildPromptDocument(request, mirroredInputFiles, {
        guidelinePatterns: request.guideline_patterns,
        guidelineOverrides: guidelineMirrors
      });
      const promptFile = import_node_path5.default.join(workspaceRoot, PROMPT_FILENAME);
      await (0, import_promises3.writeFile)(promptFile, promptContent, "utf8");
      const args = this.buildCodexArgs();
      const cwd = this.resolveCwd(workspaceRoot);
      const result = await this.executeCodex(args, cwd, promptContent, request.signal);
      if (result.timedOut) {
        throw new Error(
          `Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
        );
      }
      if (result.exitCode !== 0) {
        const detail = pickDetail(result.stderr, result.stdout);
        const prefix = `Codex CLI exited with code ${result.exitCode}`;
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parseCodexJson(result.stdout);
      const assistantText = extractAssistantText(parsed);
      return {
        text: assistantText,
        raw: {
          response: parsed,
          stdout: result.stdout,
          stderr: result.stderr,
          exitCode: result.exitCode,
          args,
          executable: this.resolvedExecutable ?? this.config.executable,
          promptFile,
          workspace: workspaceRoot,
          inputFiles: mirroredInputFiles
        }
      };
    } finally {
      await this.cleanupWorkspace(workspaceRoot);
    }
  }
  /** Validates the environment once; later calls await the same promise. */
  async ensureEnvironmentReady() {
    if (!this.environmentCheck) {
      this.environmentCheck = this.validateEnvironment();
    }
    await this.environmentCheck;
  }
  /** Locates the configured executable and stores its resolved path. */
  async validateEnvironment() {
    this.resolvedExecutable = await locateExecutable(this.config.executable);
  }
  /**
   * @param workspaceRoot - Temporary workspace of the current request.
   * @returns The configured `cwd` as an absolute path, or the workspace when none is configured.
   */
  resolveCwd(workspaceRoot) {
    if (!this.config.cwd) {
      return workspaceRoot;
    }
    return import_node_path5.default.resolve(this.config.cwd);
  }
  /**
   * @returns CLI arguments: the fixed flags, then any configured extra args,
   *   then a trailing "-" as the final argument.
   */
  buildCodexArgs() {
    const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
    if (this.config.args && this.config.args.length > 0) {
      args.push(...this.config.args);
    }
    args.push("-");
    return args;
  }
  /**
   * Invokes the runner and turns a spawn-level ENOENT into an actionable message.
   *
   * @throws Error when the executable is missing; any other runner error is rethrown unchanged.
   */
  async executeCodex(args, cwd, promptContent, signal) {
    try {
      return await this.runCodex({
        executable: this.resolvedExecutable ?? this.config.executable,
        args,
        cwd,
        prompt: promptContent,
        timeoutMs: this.config.timeoutMs,
        env: process.env,
        signal
      });
    } catch (error) {
      const err = error;
      if (err.code === "ENOENT") {
        throw new Error(
          `Codex executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
        );
      }
      throw error;
    }
  }
  /**
   * Copies the input files into `<workspaceRoot>/files`.
   *
   * The first file with a given base name keeps that name; later ones get a
   * numeric suffix (`name.1`, `name.2`, ...). The suffix is advanced until the
   * destination name is unused, so a generated name can never collide with
   * another file's real name (e.g. inputs `a.md`, `a.md` and `a.md.1`) and
   * overwrite an earlier copy.
   *
   * @param inputFiles - Absolute source paths (may be undefined or empty).
   * @param workspaceRoot - Temporary workspace directory.
   * @param guidelineOriginals - Set of absolute source paths that are guideline files.
   * @returns `{ mirroredInputFiles, guidelineMirrors }`: the copied paths in
   *   input order (undefined when there were no inputs) and the subset of
   *   copies whose source was a guideline.
   */
  async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
    if (!inputFiles || inputFiles.length === 0) {
      return {
        mirroredInputFiles: void 0,
        guidelineMirrors: /* @__PURE__ */ new Set()
      };
    }
    const filesRoot = import_node_path5.default.join(workspaceRoot, FILES_DIR);
    await (0, import_promises3.mkdir)(filesRoot, { recursive: true });
    const mirrored = [];
    const guidelineMirrors = /* @__PURE__ */ new Set();
    const nameCounts = /* @__PURE__ */ new Map();
    const usedNames = /* @__PURE__ */ new Set();
    for (const inputFile of inputFiles) {
      const absoluteSource = import_node_path5.default.resolve(inputFile);
      const baseName = import_node_path5.default.basename(absoluteSource);
      let count = nameCounts.get(baseName) ?? 0;
      let finalName = count === 0 ? baseName : `${baseName}.${count}`;
      while (usedNames.has(finalName)) {
        count += 1;
        finalName = `${baseName}.${count}`;
      }
      nameCounts.set(baseName, count + 1);
      usedNames.add(finalName);
      const destination = import_node_path5.default.join(filesRoot, finalName);
      await (0, import_promises3.copyFile)(absoluteSource, destination);
      const resolvedDestination = import_node_path5.default.resolve(destination);
      mirrored.push(resolvedDestination);
      if (guidelineOriginals.has(absoluteSource)) {
        guidelineMirrors.add(resolvedDestination);
      }
    }
    return {
      mirroredInputFiles: mirrored,
      guidelineMirrors
    };
  }
  /** Creates a fresh temporary directory under the OS temp dir. */
  async createWorkspace() {
    return await (0, import_promises3.mkdtemp)(import_node_path5.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
  }
  /** Removes the workspace; failures are ignored on purpose (best-effort cleanup). */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await (0, import_promises3.rm)(workspaceRoot, { recursive: true, force: true });
    } catch {
    }
  }
};
|
|
1385
|
+
/**
 * Resolves the Codex executable to a concrete path.
 *
 * A candidate containing a path separator is treated as a path and checked for
 * existence directly. A bare name is looked up with `where` (Windows) or
 * `which` (elsewhere).
 *
 * The lookup tool is started with `execFile` and an argument array rather than
 * a shell command string, so a candidate containing spaces or shell
 * metacharacters is passed through as a single literal argument.
 *
 * @param {string} candidate - Executable path or bare command name.
 * @returns {Promise<string>} Path of the executable.
 * @throws Error `... was not found on PATH` when a bare name cannot be located;
 *   the `access` error when an explicit path does not exist.
 */
async function locateExecutable(candidate) {
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
  if (includesPathSeparator) {
    const resolved = import_node_path5.default.isAbsolute(candidate) ? candidate : import_node_path5.default.resolve(candidate);
    const executablePath = await ensureWindowsExecutableVariant(resolved);
    await (0, import_promises3.access)(executablePath, import_node_fs3.constants.F_OK);
    return executablePath;
  }
  const locator = process.platform === "win32" ? "where" : "which";
  const execFileAsync = (0, import_node_util2.promisify)(import_node_child_process2.execFile);
  try {
    const { stdout } = await execFileAsync(locator, [candidate]);
    const lines = stdout.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
    const preferred = selectExecutableCandidate(lines);
    if (preferred) {
      const executablePath = await ensureWindowsExecutableVariant(preferred);
      await (0, import_promises3.access)(executablePath, import_node_fs3.constants.F_OK);
      return executablePath;
    }
  } catch {
    // Any lookup failure (non-zero exit, missing locator tool, inaccessible hit)
    // is reported uniformly by the error below.
  }
  throw new Error(`Codex executable '${candidate}' was not found on PATH`);
}
|
|
1407
|
+
/**
 * Picks the executable to use from the paths reported by the lookup tool.
 *
 * Outside Windows the first path wins. On Windows the executable extensions
 * are tried in order and the first path ending in one of them wins, falling
 * back to the first path.
 *
 * @param {string[]} candidates - Candidate paths, in reported order.
 * @returns {string | undefined} The selected path, or undefined for an empty list.
 */
function selectExecutableCandidate(candidates) {
  const fallback = candidates[0];
  if (candidates.length === 0 || process.platform !== "win32") {
    return fallback;
  }
  for (const ext of getWindowsExecutableExtensions()) {
    const hit = candidates.find((candidate) => candidate.toLowerCase().endsWith(ext));
    if (hit) {
      return hit;
    }
  }
  return fallback;
}
|
|
1423
|
+
/**
 * On Windows, completes an extension-less executable path with the first
 * executable extension for which a file exists.
 *
 * @param {string} candidate - Executable path.
 * @returns {Promise<string>} The path with an extension appended when such a
 *   file exists; otherwise the candidate unchanged (always unchanged outside
 *   Windows or when it already has an executable extension).
 */
async function ensureWindowsExecutableVariant(candidate) {
  const needsProbe = process.platform === "win32" && !hasExecutableExtension(candidate);
  if (!needsProbe) {
    return candidate;
  }
  for (const ext of getWindowsExecutableExtensions()) {
    const probe = candidate + ext;
    const exists = await (0, import_promises3.access)(probe, import_node_fs3.constants.F_OK).then(
      () => true,
      () => false
    );
    if (exists) {
      return probe;
    }
  }
  return candidate;
}
|
|
1441
|
+
/**
 * Reports whether a path ends (case-insensitively) in one of the extensions
 * returned by `getWindowsExecutableExtensions`.
 *
 * @param {string} candidate - Path or file name to inspect.
 * @returns {boolean} `true` when a known executable extension terminates the path.
 */
function hasExecutableExtension(candidate) {
  const normalized = candidate.toLowerCase();
  for (const extension of getWindowsExecutableExtensions()) {
    if (normalized.endsWith(extension)) {
      return true;
    }
  }
  return false;
}
|
|
1445
|
+
// Fallback extension list used on Windows when PATHEXT is unset or empty.
var DEFAULT_WINDOWS_EXTENSIONS = [".com", ".exe", ".bat", ".cmd", ".ps1"];
/**
 * Lists the file extensions treated as executable.
 *
 * Returns an empty list off Windows. On Windows it returns the `;`-separated
 * entries of `PATHEXT` (trimmed, lower-cased, blanks dropped), falling back to
 * `DEFAULT_WINDOWS_EXTENSIONS` when that yields nothing.
 *
 * @returns {string[]} Lower-case extensions.
 */
function getWindowsExecutableExtensions() {
  if (process.platform !== "win32") {
    return [];
  }
  const configured = [];
  for (const piece of process.env.PATHEXT?.split(";") ?? []) {
    const extension = piece.trim().toLowerCase();
    if (extension.length > 0) {
      configured.push(extension);
    }
  }
  return configured.length > 0 ? configured : DEFAULT_WINDOWS_EXTENSIONS;
}
|
|
1453
|
+
/**
 * Parses the text captured from the Codex CLI in `--json` mode.
 *
 * Attempts, in order: the whole trimmed text as one JSON document; the text as
 * JSON Lines (via `parseJsonLines`); the substring starting at the last `{`.
 *
 * @param {string} output - Raw text captured from the CLI.
 * @returns {unknown} The parsed value (an array when the JSON Lines path is taken).
 * @throws {Error} When the output is blank or none of the strategies parse.
 */
function parseCodexJson(output) {
  const text = output.trim();
  if (text.length === 0) {
    throw new Error("Codex CLI produced no output in --json mode");
  }
  try {
    return JSON.parse(text);
  } catch {
    // Not a single JSON document; fall through to the recovery strategies.
  }
  const events = parseJsonLines(text);
  if (events) {
    return events;
  }
  const braceIndex = text.lastIndexOf("{");
  if (braceIndex >= 0) {
    try {
      return JSON.parse(text.slice(braceIndex));
    } catch {
      // The trailing fragment is not valid JSON either; report below.
    }
  }
  // Only the first 200 characters go into the message; mark truncation.
  const ellipsis = text.length > 200 ? "\u2026" : "";
  throw new Error(`Codex CLI emitted invalid JSON: ${text.slice(0, 200)}${ellipsis}`);
}
|
|
1477
|
+
/**
 * Extracts assistant text from parsed Codex CLI output.
 *
 * Sources are tried in this order and the first non-empty text wins:
 * 1. an array input, scanned as an event stream;
 * 2. the value itself treated as a single event;
 * 3. the last entry of `messages` whose `role` is `"assistant"`;
 * 4. `response.content`;
 * 5. `output`.
 *
 * @param {unknown} parsed - Value returned by the JSON parsing step.
 * @returns {string} The assistant text.
 * @throws {Error} When the value is not an object or no source yields text.
 */
function extractAssistantText(parsed) {
  const missingMessage = "Codex CLI JSON response did not include an assistant message";
  if (Array.isArray(parsed)) {
    const streamText = extractFromEventStream(parsed);
    if (streamText) {
      return streamText;
    }
  }
  if (!parsed || typeof parsed !== "object") {
    throw new Error(missingMessage);
  }
  const direct = extractFromEvent(parsed);
  if (direct) {
    return direct;
  }
  if (Array.isArray(parsed.messages)) {
    // Walk newest-first so the most recent assistant message wins.
    for (const entry of [...parsed.messages].reverse()) {
      const isAssistant = entry && typeof entry === "object" && entry.role === "assistant";
      if (!isAssistant) {
        continue;
      }
      const messageText = flattenContent(entry.content);
      if (messageText) {
        return messageText;
      }
    }
  }
  const reply = parsed.response;
  if (reply && typeof reply === "object") {
    const replyText = flattenContent(reply.content);
    if (replyText) {
      return replyText;
    }
  }
  const outputText = flattenContent(parsed.output);
  if (outputText) {
    return outputText;
  }
  throw new Error(missingMessage);
}
|
|
1525
|
+
/**
 * Scans a list of events from last to first and returns the first non-empty
 * text that `extractFromEvent` produces.
 *
 * @param {unknown[]} events - Parsed events, oldest first.
 * @returns {string | undefined} Text of the latest event that has any.
 */
function extractFromEventStream(events) {
  let cursor = events.length;
  while (cursor-- > 0) {
    const text = extractFromEvent(events[cursor]);
    if (text) {
      return text;
    }
  }
  return undefined;
}
|
|
1535
|
+
/**
 * Extracts text from one event object.
 *
 * When the event's `type` equals `JSONL_TYPE_ITEM_COMPLETED`, its `item` is
 * tried first via `extractFromItem`. Otherwise, or if that yields nothing,
 * the event's `output` (or `content` when `output` is nullish) is flattened.
 *
 * @param {unknown} event - A parsed event.
 * @returns {string | undefined} Non-empty text, or `undefined`.
 */
function extractFromEvent(event) {
  if (event === null || typeof event !== "object") {
    return undefined;
  }
  const kind = typeof event.type === "string" ? event.type : undefined;
  if (kind === JSONL_TYPE_ITEM_COMPLETED) {
    const itemText = extractFromItem(event.item);
    if (itemText) {
      return itemText;
    }
  }
  const payload = event.output ?? event.content;
  // Treat an empty string the same as "no text".
  return flattenContent(payload) || undefined;
}
|
|
1555
|
+
/**
 * Extracts text from an event item whose `type` is `"agent_message"`,
 * `"response"` or `"output"`, reading `text`, then `content`, then `output`
 * (first non-nullish wins).
 *
 * @param {unknown} item - The `item` payload of an event.
 * @returns {string | undefined} Non-empty text, or `undefined` for other item
 *   types, non-objects, or empty content.
 */
function extractFromItem(item) {
  if (item === null || typeof item !== "object") {
    return undefined;
  }
  const kind = typeof item.type === "string" ? item.type : undefined;
  const textBearingKinds = ["agent_message", "response", "output"];
  if (!textBearingKinds.includes(kind)) {
    return undefined;
  }
  // Treat an empty string the same as "no text".
  return flattenContent(item.text ?? item.content ?? item.output) || undefined;
}
|
|
1569
|
+
/**
 * Collapses a content value into a single string.
 *
 * - A string is returned as-is (including the empty string).
 * - An array contributes each string element and each object element's string
 *   `text`; empty pieces are dropped and the rest joined with `" \n"`.
 * - An object with a string `text` property yields that text.
 *
 * @param {unknown} value - String, array of segments, or `{ text }` object.
 * @returns {string | undefined} The flattened text, or `undefined` when the
 *   value has none of the shapes above or an array yields no pieces.
 */
function flattenContent(value) {
  if (typeof value === "string") {
    return value;
  }
  // Reads the string `text` property of an object, if it has one.
  const textOf = (node) => {
    if (node && typeof node === "object" && "text" in node) {
      const inner = node.text;
      return typeof inner === "string" ? inner : undefined;
    }
    return undefined;
  };
  if (Array.isArray(value)) {
    const pieces = [];
    for (const segment of value) {
      const piece = typeof segment === "string" ? segment : textOf(segment);
      if (typeof piece === "string" && piece.length > 0) {
        pieces.push(piece);
      }
    }
    return pieces.length > 0 ? pieces.join(" \n") : undefined;
  }
  return textOf(value);
}
|
|
1592
|
+
/**
 * Parses text as JSON Lines: one JSON document per non-blank line.
 *
 * @param {string} output - Text to parse.
 * @returns {unknown[] | undefined} The parsed values in line order, or
 *   `undefined` when there are fewer than two non-blank lines or any line is
 *   not valid JSON.
 */
function parseJsonLines(output) {
  const rows = [];
  for (const rawLine of output.split(/\r?\n/)) {
    const row = rawLine.trim();
    if (row.length > 0) {
      rows.push(row);
    }
  }
  if (rows.length <= 1) {
    return undefined;
  }
  try {
    return rows.map((row) => JSON.parse(row));
  } catch {
    // One bad line invalidates the whole JSON Lines interpretation.
    return undefined;
  }
}
|
|
1607
|
+
/**
 * Chooses the text to show as failure detail: trimmed stderr when it is not
 * blank, otherwise trimmed stdout.
 *
 * @param {string} stderr - Captured standard error.
 * @param {string} stdout - Captured standard output.
 * @returns {string | undefined} The chosen text, or `undefined` if both are blank.
 */
function pickDetail(stderr, stdout) {
  for (const stream of [stderr, stdout]) {
    const text = stream.trim();
    if (text.length > 0) {
      return text;
    }
  }
  return undefined;
}
|
|
1615
|
+
/**
 * Builds the " after Ns" suffix for timeout messages.
 *
 * @param {number | undefined} timeoutMs - Timeout in milliseconds.
 * @returns {string} `" after <seconds>s"` with seconds rounded up, or `""`
 *   when the timeout is falsy or not positive.
 */
function formatTimeoutSuffix2(timeoutMs) {
  const disabled = !timeoutMs || timeoutMs <= 0;
  return disabled ? "" : ` after ${Math.ceil(timeoutMs / 1e3)}s`;
}
|
|
1622
|
+
/**
 * Runs the Codex executable, feeds it the prompt on stdin and collects its
 * output.
 *
 * @param {object} options
 * @param {string} options.executable - Program to spawn.
 * @param {string[]} [options.args] - Arguments passed to the program.
 * @param {string} [options.cwd] - Working directory for the child.
 * @param {object} [options.env] - Environment for the child.
 * @param {string} [options.prompt] - Text written to the child's stdin.
 * @param {number} [options.timeoutMs] - When positive, the child is sent
 *   SIGTERM after this many milliseconds and the result has `timedOut: true`.
 * @param {AbortSignal} [options.signal] - Aborting sends SIGTERM to the child.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number, timedOut: boolean}>}
 *   Resolves when the child's streams close; `exitCode` is `-1` when the child
 *   did not exit with a numeric code (e.g. it was killed by a signal).
 * @throws Rejects with the error emitted by the child process (e.g. spawn failure).
 */
async function defaultCodexRunner(options) {
  return await new Promise((resolve, reject) => {
    const child = (0, import_node_child_process2.spawn)(options.executable, options.args, {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: shouldShellExecute(options.executable)
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    const onAbort = () => {
      child.kill("SIGTERM");
    };
    if (options.signal) {
      if (options.signal.aborted) {
        onAbort();
      } else {
        options.signal.addEventListener("abort", onAbort, { once: true });
      }
    }
    let timeoutHandle;
    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill("SIGTERM");
      }, options.timeoutMs);
      // Do not let a pending timeout keep the event loop alive on its own.
      timeoutHandle.unref?.();
    }
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    // If the child exits (or fails to start) before it has read the whole
    // prompt, the pending write fails with EPIPE/ECONNRESET. Without a
    // listener that stream error would be thrown as an uncaught exception and
    // take down the host process. The outcome of the run is already reported
    // through the child's 'close'/'error' events, so the write failure itself
    // carries no extra information and is intentionally ignored here.
    child.stdin.on("error", () => {
    });
    child.stdin.end(options.prompt);
    const cleanup = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", onAbort);
      }
    };
    child.on("error", (error) => {
      cleanup();
      reject(error);
    });
    child.on("close", (code) => {
      cleanup();
      resolve({
        stdout,
        stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
/**
 * Decides whether the executable must be launched through a shell.
 *
 * @param {string} executable - Path of the program to spawn.
 * @returns {boolean} `true` only on Windows for `.cmd`, `.bat` and `.ps1`
 *   files (matched case-insensitively); `false` everywhere else.
 */
function shouldShellExecute(executable) {
  if (process.platform !== "win32") {
    return false;
  }
  const lower = executable.toLowerCase();
  return [".cmd", ".bat", ".ps1"].some((extension) => lower.endsWith(extension));
}
|
|
1690
|
+
|
|
838
1691
|
// src/evaluation/providers/mock.ts
|
|
839
1692
|
var DEFAULT_MOCK_RESPONSE = '{"answer":"Mock provider response. Configure targets.yaml to supply a custom value."}';
|
|
840
1693
|
var MockProvider = class {
|
|
@@ -878,6 +1731,7 @@ var MockProvider = class {
|
|
|
878
1731
|
|
|
879
1732
|
// src/evaluation/providers/targets.ts
|
|
880
1733
|
var import_zod = require("zod");
|
|
1734
|
+
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES"]);
|
|
881
1735
|
var BASE_TARGET_SCHEMA = import_zod.z.object({
|
|
882
1736
|
name: import_zod.z.string().min(1, "target name is required"),
|
|
883
1737
|
provider: import_zod.z.string().min(1, "provider is required"),
|
|
@@ -934,6 +1788,16 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
934
1788
|
providerBatching,
|
|
935
1789
|
config: resolveGeminiConfig(parsed, env)
|
|
936
1790
|
};
|
|
1791
|
+
case "codex":
|
|
1792
|
+
case "codex-cli":
|
|
1793
|
+
return {
|
|
1794
|
+
kind: "codex",
|
|
1795
|
+
name: parsed.name,
|
|
1796
|
+
judgeTarget: parsed.judge_target,
|
|
1797
|
+
workers: parsed.workers,
|
|
1798
|
+
providerBatching,
|
|
1799
|
+
config: resolveCodexConfig(parsed, env)
|
|
1800
|
+
};
|
|
937
1801
|
case "mock":
|
|
938
1802
|
return {
|
|
939
1803
|
kind: "mock",
|
|
@@ -951,7 +1815,16 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
951
1815
|
judgeTarget: parsed.judge_target,
|
|
952
1816
|
workers: parsed.workers,
|
|
953
1817
|
providerBatching,
|
|
954
|
-
config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
|
|
1818
|
+
config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
|
|
1819
|
+
};
|
|
1820
|
+
case "cli":
|
|
1821
|
+
return {
|
|
1822
|
+
kind: "cli",
|
|
1823
|
+
name: parsed.name,
|
|
1824
|
+
judgeTarget: parsed.judge_target,
|
|
1825
|
+
workers: parsed.workers,
|
|
1826
|
+
providerBatching,
|
|
1827
|
+
config: resolveCliConfig(parsed, env)
|
|
955
1828
|
};
|
|
956
1829
|
default:
|
|
957
1830
|
throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
|
|
@@ -1020,6 +1893,29 @@ function resolveGeminiConfig(target, env) {
|
|
|
1020
1893
|
maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`)
|
|
1021
1894
|
};
|
|
1022
1895
|
}
|
|
1896
|
+
/**
 * Builds the Codex provider configuration from a target's `settings`.
 *
 * Recognised keys (first non-nullish alias wins):
 * - executable: `executable`, `command`, `binary`; `"codex"` is used when the
 *   resolver returns nullish
 * - arguments: `args`, `arguments`
 * - working directory: `cwd`
 * - timeout: `timeout_seconds`, `timeoutSeconds`
 *
 * @param {{name: string, settings?: object}} target - Parsed target definition.
 * @param {object} env - Environment map passed through to the resolvers.
 * @returns {{executable: string, args: (string[]|undefined), cwd: (string|undefined), timeoutMs: (number|undefined)}}
 */
function resolveCodexConfig(target, env) {
  const settings = target.settings ?? {};
  const label = (suffix) => `${target.name} codex ${suffix}`;
  const resolvedExecutable = resolveOptionalString(
    settings.executable ?? settings.command ?? settings.binary,
    env,
    label("executable"),
    { allowLiteral: true, optionalEnv: true }
  );
  const args = resolveOptionalStringArray(settings.args ?? settings.arguments, env, label("args"));
  const cwd = resolveOptionalString(settings.cwd, env, label("cwd"), {
    allowLiteral: true,
    optionalEnv: true
  });
  const timeoutMs = resolveTimeoutMs(settings.timeout_seconds ?? settings.timeoutSeconds, label("timeout"));
  return {
    executable: resolvedExecutable ?? "codex",
    args,
    cwd,
    timeoutMs
  };
}
|
|
1023
1919
|
function resolveMockConfig(target) {
|
|
1024
1920
|
const settings = target.settings ?? {};
|
|
1025
1921
|
const response = typeof settings.response === "string" ? settings.response : void 0;
|
|
@@ -1049,6 +1945,125 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
1049
1945
|
workspaceTemplate
|
|
1050
1946
|
};
|
|
1051
1947
|
}
|
|
1948
|
+
/**
 * Builds the generic CLI provider configuration from a target's `settings`.
 *
 * Recognised keys (first non-nullish alias wins):
 * - command template: `command_template`, `commandTemplate`; validated for
 *   supported placeholders
 * - files format: `files_format`, `filesFormat`, `attachments_format`,
 *   `attachmentsFormat`
 * - `cwd`, `env`, `healthcheck`
 * - timeout: `timeout_seconds`, `timeoutSeconds`
 *
 * @param {{name: string, settings?: object}} target - Parsed target definition.
 * @param {object} env - Environment map passed through to the resolvers.
 * @returns {{commandTemplate: string, filesFormat: *, cwd: *, env: *, timeoutMs: *, healthcheck: *}}
 * @throws Propagates whatever the individual resolvers and the placeholder
 *   check throw.
 */
function resolveCliConfig(target, env) {
  const settings = target.settings ?? {};
  const templateSource = settings.command_template ?? settings.commandTemplate;
  const filesFormat = resolveOptionalLiteralString(
    settings.files_format ?? settings.filesFormat ?? settings.attachments_format ?? settings.attachmentsFormat
  );
  const cwd = resolveOptionalString(settings.cwd, env, `${target.name} working directory`, {
    allowLiteral: true,
    optionalEnv: true
  });
  const envOverrides = resolveEnvOverrides(settings.env, env, target.name);
  const timeoutMs = resolveTimeoutMs(settings.timeout_seconds ?? settings.timeoutSeconds, `${target.name} timeout`);
  const healthcheck = resolveCliHealthcheck(settings.healthcheck, env, target.name);
  const templateLabel = `${target.name} CLI command template`;
  const commandTemplate = resolveString(templateSource, env, templateLabel, true);
  assertSupportedCliPlaceholders(commandTemplate, templateLabel);
  return {
    commandTemplate,
    filesFormat,
    cwd,
    env: envOverrides,
    timeoutMs,
    healthcheck
  };
}
|
|
1977
|
+
/**
 * Resolves the `env` setting of a CLI target into a map of string values.
 *
 * @param {unknown} source - Raw `env` setting; nullish means "not configured".
 * @param {object} env - Environment map passed to `resolveString`.
 * @param {string} targetName - Target name used in error messages.
 * @returns {Object<string, string> | undefined} Resolved overrides, or
 *   `undefined` when the setting is absent or resolves to no entries.
 * @throws {Error} When the setting is not a plain object map or a value is not
 *   a string.
 */
function resolveEnvOverrides(source, env, targetName) {
  if (source === undefined || source === null) {
    return undefined;
  }
  const isMap = typeof source === "object" && !Array.isArray(source);
  if (!isMap) {
    throw new Error(`${targetName} env overrides must be an object map of strings`);
  }
  const overrides = {};
  for (const name of Object.keys(source)) {
    const raw = source[name];
    if (typeof raw !== "string") {
      throw new Error(`${targetName} env override '${name}' must be a string`);
    }
    overrides[name] = resolveString(raw, env, `${targetName} env override '${name}'`);
  }
  return Object.keys(overrides).length > 0 ? overrides : undefined;
}
|
|
1995
|
+
/**
 * Converts a timeout setting expressed in seconds into whole milliseconds.
 *
 * @param {unknown} source - Raw setting, resolved via `resolveOptionalNumber`.
 * @param {string} description - Label used in error messages.
 * @returns {number | undefined} Milliseconds rounded down, or `undefined` when
 *   no timeout is configured.
 * @throws {Error} When the resolved number of seconds is zero or negative.
 */
function resolveTimeoutMs(source, description) {
  const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
  if (seconds === undefined) {
    return undefined;
  }
  if (seconds <= 0) {
    throw new Error(`${description} must be greater than zero seconds`);
  }
  const milliseconds = seconds * 1e3;
  return Math.floor(milliseconds);
}
|
|
2005
|
+
/**
 * Resolves the `healthcheck` setting of a CLI target.
 *
 * Two shapes are accepted, selected by `type`:
 * - `"http"`: requires `url`;
 * - `"command"`: requires `command_template`/`commandTemplate` (checked for
 *   supported placeholders) and accepts an optional `cwd`.
 * Both accept `timeout_seconds`/`timeoutSeconds`, which is resolved before the
 * type is inspected.
 *
 * @param {unknown} source - Raw setting; nullish means "not configured".
 * @param {object} env - Environment map passed to the string resolvers.
 * @param {string} targetName - Target name used in error messages.
 * @returns {object | undefined} The resolved healthcheck, or `undefined`.
 * @throws {Error} When the setting is not an object or `type` is unsupported.
 */
function resolveCliHealthcheck(source, env, targetName) {
  if (source === undefined || source === null) {
    return undefined;
  }
  const isObject = typeof source === "object" && !Array.isArray(source);
  if (!isObject) {
    throw new Error(`${targetName} healthcheck must be an object`);
  }
  const kind = source.type;
  const timeoutMs = resolveTimeoutMs(
    source.timeout_seconds ?? source.timeoutSeconds,
    `${targetName} healthcheck timeout`
  );
  switch (kind) {
    case "http": {
      const url = resolveString(source.url, env, `${targetName} healthcheck URL`);
      return { type: "http", url, timeoutMs };
    }
    case "command": {
      const templateLabel = `${targetName} healthcheck command template`;
      const commandTemplate = resolveString(
        source.command_template ?? source.commandTemplate,
        env,
        templateLabel,
        true
      );
      assertSupportedCliPlaceholders(commandTemplate, templateLabel);
      const cwd = resolveOptionalString(source.cwd, env, `${targetName} healthcheck cwd`, {
        allowLiteral: true,
        optionalEnv: true
      });
      return { type: "command", commandTemplate, timeoutMs, cwd };
    }
    default:
      throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
  }
}
|
|
2047
|
+
/**
 * Verifies that every `{PLACEHOLDER}` found in a command template is listed in
 * `CLI_PLACEHOLDERS`.
 *
 * @param {string} template - Command template to validate.
 * @param {string} description - Label used in the error message.
 * @returns {void}
 * @throws {Error} Naming the first unsupported placeholder and listing the
 *   supported ones.
 */
function assertSupportedCliPlaceholders(template, description) {
  const unsupported = extractCliPlaceholders(template).find((name) => !CLI_PLACEHOLDERS.has(name));
  if (unsupported === undefined) {
    return;
  }
  throw new Error(
    `${description} includes unsupported placeholder '{${unsupported}}'. Supported placeholders: ${Array.from(CLI_PLACEHOLDERS).join(", ")}`
  );
}
|
|
2057
|
+
/**
 * Lists the placeholder names used in a command template.
 *
 * A placeholder is a run of upper-case letters and underscores wrapped in
 * braces, e.g. `{PROMPT}`. Names are returned in order of appearance,
 * duplicates included.
 *
 * @param {string} template - Command template to scan.
 * @returns {string[]} Placeholder names without the braces.
 */
function extractCliPlaceholders(template) {
  const names = Array.from(template.matchAll(/\{([A-Z_]+)\}/g), (hit) => hit[1]);
  return names.filter(Boolean);
}
|
|
1052
2067
|
function resolveString(source, env, description, allowLiteral = false) {
|
|
1053
2068
|
const value = resolveOptionalString(source, env, description, {
|
|
1054
2069
|
allowLiteral,
|
|
@@ -1079,11 +2094,14 @@ function resolveOptionalString(source, env, description, options) {
|
|
|
1079
2094
|
}
|
|
1080
2095
|
const allowLiteral = options?.allowLiteral ?? false;
|
|
1081
2096
|
const optionalEnv = options?.optionalEnv ?? false;
|
|
1082
|
-
|
|
2097
|
+
const looksLikeEnv = isLikelyEnvReference(trimmed);
|
|
2098
|
+
if (looksLikeEnv) {
|
|
1083
2099
|
if (optionalEnv) {
|
|
1084
2100
|
return void 0;
|
|
1085
2101
|
}
|
|
1086
|
-
|
|
2102
|
+
if (!allowLiteral) {
|
|
2103
|
+
throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
|
|
2104
|
+
}
|
|
1087
2105
|
}
|
|
1088
2106
|
return trimmed;
|
|
1089
2107
|
}
|
|
@@ -1133,10 +2151,42 @@ function resolveOptionalBoolean(source) {
|
|
|
1133
2151
|
/**
 * Heuristic for "this value is written like an environment variable name":
 * it consists solely of upper-case ASCII letters, digits and underscores.
 *
 * @param {string} value - Candidate value.
 * @returns {boolean} `true` when the whole value matches that alphabet.
 */
function isLikelyEnvReference(value) {
  const envNamePattern = /^[A-Z0-9_]+$/;
  return envNamePattern.exec(value) !== null;
}
|
|
2154
|
+
/**
 * Resolves an optional list of string settings (e.g. CLI arguments).
 *
 * Each item is trimmed. If the trimmed item names an entry in `env` whose
 * value is a string, that value is used (as-is, untrimmed); otherwise the
 * trimmed item itself is kept as a literal.
 *
 * @param {unknown} source - Raw setting; nullish means "not configured".
 * @param {object} env - Environment map used for name lookups.
 * @param {string} description - Label used in error messages.
 * @returns {string[] | undefined} The resolved items, or `undefined` when the
 *   setting is absent or an empty array.
 * @throws {Error} When the setting is not an array, an item is not a string,
 *   an item is blank, or a referenced environment value is blank.
 */
function resolveOptionalStringArray(source, env, description) {
  if (source === undefined || source === null) {
    return undefined;
  }
  if (!Array.isArray(source)) {
    throw new Error(`${description} must be an array of strings`);
  }
  if (source.length === 0) {
    return undefined;
  }
  const resolved = [];
  for (const [i, item] of source.entries()) {
    if (typeof item !== "string") {
      throw new Error(`${description}[${i}] must be a string`);
    }
    const trimmed = item.trim();
    if (trimmed.length === 0) {
      throw new Error(`${description}[${i}] cannot be empty`);
    }
    const envValue = env[trimmed];
    // Only substitute real string values. A bare `!== undefined` check also
    // matches members inherited from Object.prototype (an argument such as
    // "toString" or "constructor" looks up a function), which then crashed
    // on `.trim()` instead of being treated as a literal argument.
    if (typeof envValue === "string") {
      if (envValue.trim().length === 0) {
        throw new Error(`Environment variable '${trimmed}' for ${description}[${i}] is empty`);
      }
      resolved.push(envValue);
    } else {
      resolved.push(trimmed);
    }
  }
  return resolved;
}
|
|
1136
2186
|
|
|
1137
2187
|
// src/evaluation/providers/vscode.ts
|
|
1138
|
-
var
|
|
1139
|
-
var
|
|
2188
|
+
var import_promises4 = require("fs/promises");
|
|
2189
|
+
var import_node_path6 = __toESM(require("path"), 1);
|
|
1140
2190
|
var import_subagent = require("subagent");
|
|
1141
2191
|
var VSCodeProvider = class {
|
|
1142
2192
|
id;
|
|
@@ -1154,12 +2204,11 @@ var VSCodeProvider = class {
|
|
|
1154
2204
|
if (request.signal?.aborted) {
|
|
1155
2205
|
throw new Error("VS Code provider request was aborted before dispatch");
|
|
1156
2206
|
}
|
|
1157
|
-
const
|
|
1158
|
-
const promptContent =
|
|
2207
|
+
const inputFiles = normalizeAttachments(request.inputFiles);
|
|
2208
|
+
const promptContent = buildPromptDocument2(request, inputFiles, request.guideline_patterns);
|
|
1159
2209
|
const session = await (0, import_subagent.dispatchAgentSession)({
|
|
1160
2210
|
userQuery: promptContent,
|
|
1161
|
-
|
|
1162
|
-
extraAttachments: attachments,
|
|
2211
|
+
extraAttachments: inputFiles,
|
|
1163
2212
|
wait: this.config.waitForResponse,
|
|
1164
2213
|
dryRun: this.config.dryRun,
|
|
1165
2214
|
vscodeCmd: this.config.command,
|
|
@@ -1176,16 +2225,16 @@ var VSCodeProvider = class {
|
|
|
1176
2225
|
text: "",
|
|
1177
2226
|
raw: {
|
|
1178
2227
|
session,
|
|
1179
|
-
|
|
2228
|
+
inputFiles
|
|
1180
2229
|
}
|
|
1181
2230
|
};
|
|
1182
2231
|
}
|
|
1183
|
-
const responseText = await (0,
|
|
2232
|
+
const responseText = await (0, import_promises4.readFile)(session.responseFile, "utf8");
|
|
1184
2233
|
return {
|
|
1185
2234
|
text: responseText,
|
|
1186
2235
|
raw: {
|
|
1187
2236
|
session,
|
|
1188
|
-
|
|
2237
|
+
inputFiles
|
|
1189
2238
|
}
|
|
1190
2239
|
};
|
|
1191
2240
|
}
|
|
@@ -1195,17 +2244,17 @@ var VSCodeProvider = class {
|
|
|
1195
2244
|
}
|
|
1196
2245
|
const normalizedRequests = requests.map((req) => ({
|
|
1197
2246
|
request: req,
|
|
1198
|
-
|
|
2247
|
+
inputFiles: normalizeAttachments(req.inputFiles)
|
|
1199
2248
|
}));
|
|
1200
|
-
const
|
|
1201
|
-
normalizedRequests.map(({
|
|
2249
|
+
const combinedInputFiles = mergeAttachments(
|
|
2250
|
+
normalizedRequests.map(({ inputFiles }) => inputFiles)
|
|
1202
2251
|
);
|
|
1203
2252
|
const userQueries = normalizedRequests.map(
|
|
1204
|
-
({ request,
|
|
2253
|
+
({ request, inputFiles }) => buildPromptDocument2(request, inputFiles, request.guideline_patterns)
|
|
1205
2254
|
);
|
|
1206
2255
|
const session = await (0, import_subagent.dispatchBatchAgent)({
|
|
1207
2256
|
userQueries,
|
|
1208
|
-
extraAttachments:
|
|
2257
|
+
extraAttachments: combinedInputFiles,
|
|
1209
2258
|
wait: this.config.waitForResponse,
|
|
1210
2259
|
dryRun: this.config.dryRun,
|
|
1211
2260
|
vscodeCmd: this.config.command,
|
|
@@ -1218,12 +2267,12 @@ var VSCodeProvider = class {
|
|
|
1218
2267
|
throw new Error(failure);
|
|
1219
2268
|
}
|
|
1220
2269
|
if (this.config.dryRun) {
|
|
1221
|
-
return normalizedRequests.map(({
|
|
2270
|
+
return normalizedRequests.map(({ inputFiles }) => ({
|
|
1222
2271
|
text: "",
|
|
1223
2272
|
raw: {
|
|
1224
2273
|
session,
|
|
1225
|
-
|
|
1226
|
-
|
|
2274
|
+
inputFiles,
|
|
2275
|
+
allInputFiles: combinedInputFiles
|
|
1227
2276
|
}
|
|
1228
2277
|
}));
|
|
1229
2278
|
}
|
|
@@ -1234,13 +2283,13 @@ var VSCodeProvider = class {
|
|
|
1234
2283
|
}
|
|
1235
2284
|
const responses = [];
|
|
1236
2285
|
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
1237
|
-
const responseText = await (0,
|
|
2286
|
+
const responseText = await (0, import_promises4.readFile)(responseFile, "utf8");
|
|
1238
2287
|
responses.push({
|
|
1239
2288
|
text: responseText,
|
|
1240
2289
|
raw: {
|
|
1241
2290
|
session,
|
|
1242
|
-
|
|
1243
|
-
|
|
2291
|
+
inputFiles: normalizedRequests[index]?.inputFiles,
|
|
2292
|
+
allInputFiles: combinedInputFiles,
|
|
1244
2293
|
responseFile
|
|
1245
2294
|
}
|
|
1246
2295
|
});
|
|
@@ -1248,27 +2297,27 @@ var VSCodeProvider = class {
|
|
|
1248
2297
|
return responses;
|
|
1249
2298
|
}
|
|
1250
2299
|
};
|
|
1251
|
-
function
|
|
2300
|
+
function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
1252
2301
|
const parts = [];
|
|
1253
|
-
const guidelineFiles =
|
|
2302
|
+
const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
|
|
1254
2303
|
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
1255
2304
|
const nonGuidelineAttachments = attachmentFiles.filter(
|
|
1256
2305
|
(file) => !guidelineFiles.includes(file)
|
|
1257
2306
|
);
|
|
1258
|
-
const prereadBlock =
|
|
2307
|
+
const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineAttachments);
|
|
1259
2308
|
if (prereadBlock.length > 0) {
|
|
1260
2309
|
parts.push("\n", prereadBlock);
|
|
1261
2310
|
}
|
|
1262
2311
|
parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
|
|
1263
2312
|
return parts.join("\n").trim();
|
|
1264
2313
|
}
|
|
1265
|
-
function
|
|
2314
|
+
function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
1266
2315
|
if (guidelineFiles.length === 0 && attachmentFiles.length === 0) {
|
|
1267
2316
|
return "";
|
|
1268
2317
|
}
|
|
1269
2318
|
const buildList = (files) => files.map((absolutePath) => {
|
|
1270
|
-
const fileName =
|
|
1271
|
-
const fileUri =
|
|
2319
|
+
const fileName = import_node_path6.default.basename(absolutePath);
|
|
2320
|
+
const fileUri = pathToFileUri2(absolutePath);
|
|
1272
2321
|
return `* [${fileName}](${fileUri})`;
|
|
1273
2322
|
});
|
|
1274
2323
|
const sections = [];
|
|
@@ -1286,14 +2335,14 @@ ${buildList(attachmentFiles).join("\n")}.`);
|
|
|
1286
2335
|
);
|
|
1287
2336
|
return sections.join("\n");
|
|
1288
2337
|
}
|
|
1289
|
-
function
|
|
2338
|
+
function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
1290
2339
|
if (!attachments || attachments.length === 0) {
|
|
1291
2340
|
return [];
|
|
1292
2341
|
}
|
|
1293
2342
|
const unique = /* @__PURE__ */ new Map();
|
|
1294
2343
|
for (const attachment of attachments) {
|
|
1295
|
-
const absolutePath =
|
|
1296
|
-
const normalized = absolutePath.split(
|
|
2344
|
+
const absolutePath = import_node_path6.default.resolve(attachment);
|
|
2345
|
+
const normalized = absolutePath.split(import_node_path6.default.sep).join("/");
|
|
1297
2346
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1298
2347
|
if (!unique.has(absolutePath)) {
|
|
1299
2348
|
unique.set(absolutePath, absolutePath);
|
|
@@ -1308,15 +2357,15 @@ function collectAttachmentFiles(attachments) {
|
|
|
1308
2357
|
}
|
|
1309
2358
|
const unique = /* @__PURE__ */ new Map();
|
|
1310
2359
|
for (const attachment of attachments) {
|
|
1311
|
-
const absolutePath =
|
|
2360
|
+
const absolutePath = import_node_path6.default.resolve(attachment);
|
|
1312
2361
|
if (!unique.has(absolutePath)) {
|
|
1313
2362
|
unique.set(absolutePath, absolutePath);
|
|
1314
2363
|
}
|
|
1315
2364
|
}
|
|
1316
2365
|
return Array.from(unique.values());
|
|
1317
2366
|
}
|
|
1318
|
-
function
|
|
1319
|
-
const absolutePath =
|
|
2367
|
+
function pathToFileUri2(filePath) {
|
|
2368
|
+
const absolutePath = import_node_path6.default.isAbsolute(filePath) ? filePath : import_node_path6.default.resolve(filePath);
|
|
1320
2369
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
1321
2370
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
1322
2371
|
return `file:///${normalizedPath}`;
|
|
@@ -1329,7 +2378,7 @@ function normalizeAttachments(attachments) {
|
|
|
1329
2378
|
}
|
|
1330
2379
|
const deduped = /* @__PURE__ */ new Set();
|
|
1331
2380
|
for (const attachment of attachments) {
|
|
1332
|
-
deduped.add(
|
|
2381
|
+
deduped.add(import_node_path6.default.resolve(attachment));
|
|
1333
2382
|
}
|
|
1334
2383
|
return Array.from(deduped);
|
|
1335
2384
|
}
|
|
@@ -1337,8 +2386,8 @@ function mergeAttachments(all) {
|
|
|
1337
2386
|
const deduped = /* @__PURE__ */ new Set();
|
|
1338
2387
|
for (const list of all) {
|
|
1339
2388
|
if (!list) continue;
|
|
1340
|
-
for (const
|
|
1341
|
-
deduped.add(
|
|
2389
|
+
for (const inputFile of list) {
|
|
2390
|
+
deduped.add(import_node_path6.default.resolve(inputFile));
|
|
1342
2391
|
}
|
|
1343
2392
|
}
|
|
1344
2393
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -1383,9 +2432,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
1383
2432
|
}
|
|
1384
2433
|
|
|
1385
2434
|
// src/evaluation/providers/targets-file.ts
|
|
1386
|
-
var
|
|
1387
|
-
var
|
|
1388
|
-
var
|
|
2435
|
+
var import_node_fs4 = require("fs");
|
|
2436
|
+
var import_promises5 = require("fs/promises");
|
|
2437
|
+
var import_node_path7 = __toESM(require("path"), 1);
|
|
1389
2438
|
var import_yaml2 = require("yaml");
|
|
1390
2439
|
|
|
1391
2440
|
// src/evaluation/providers/types.ts
|
|
@@ -1446,18 +2495,18 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
1446
2495
|
}
|
|
1447
2496
|
async function fileExists3(filePath) {
|
|
1448
2497
|
try {
|
|
1449
|
-
await (0,
|
|
2498
|
+
await (0, import_promises5.access)(filePath, import_node_fs4.constants.F_OK);
|
|
1450
2499
|
return true;
|
|
1451
2500
|
} catch {
|
|
1452
2501
|
return false;
|
|
1453
2502
|
}
|
|
1454
2503
|
}
|
|
1455
2504
|
async function readTargetDefinitions(filePath) {
|
|
1456
|
-
const absolutePath =
|
|
2505
|
+
const absolutePath = import_node_path7.default.resolve(filePath);
|
|
1457
2506
|
if (!await fileExists3(absolutePath)) {
|
|
1458
2507
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
1459
2508
|
}
|
|
1460
|
-
const raw = await (0,
|
|
2509
|
+
const raw = await (0, import_promises5.readFile)(absolutePath, "utf8");
|
|
1461
2510
|
const parsed = (0, import_yaml2.parse)(raw);
|
|
1462
2511
|
if (!isRecord(parsed)) {
|
|
1463
2512
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
@@ -1480,6 +2529,10 @@ function createProvider(target) {
|
|
|
1480
2529
|
return new AnthropicProvider(target.name, target.config);
|
|
1481
2530
|
case "gemini":
|
|
1482
2531
|
return new GeminiProvider(target.name, target.config);
|
|
2532
|
+
case "cli":
|
|
2533
|
+
return new CliProvider(target.name, target.config);
|
|
2534
|
+
case "codex":
|
|
2535
|
+
return new CodexProvider(target.name, target.config);
|
|
1483
2536
|
case "mock":
|
|
1484
2537
|
return new MockProvider(target.name, target.config);
|
|
1485
2538
|
case "vscode":
|
|
@@ -1496,230 +2549,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
1496
2549
|
return createProvider(resolved);
|
|
1497
2550
|
}
|
|
1498
2551
|
|
|
1499
|
-
// src/evaluation/
|
|
1500
|
-
var KEY_TERM_MATCH_THRESHOLD = 0.5;
|
|
1501
|
-
var ACTION_WORDS = /* @__PURE__ */ new Set([
|
|
1502
|
-
"use",
|
|
1503
|
-
"avoid",
|
|
1504
|
-
"prefer",
|
|
1505
|
-
"replace",
|
|
1506
|
-
"consider",
|
|
1507
|
-
"ensure",
|
|
1508
|
-
"remove",
|
|
1509
|
-
"add"
|
|
1510
|
-
]);
|
|
1511
|
-
var STOP_WORDS = /* @__PURE__ */ new Set([
|
|
1512
|
-
"the",
|
|
1513
|
-
"a",
|
|
1514
|
-
"an",
|
|
1515
|
-
"and",
|
|
1516
|
-
"or",
|
|
1517
|
-
"but",
|
|
1518
|
-
"in",
|
|
1519
|
-
"on",
|
|
1520
|
-
"at",
|
|
1521
|
-
"to",
|
|
1522
|
-
"for",
|
|
1523
|
-
"of",
|
|
1524
|
-
"with",
|
|
1525
|
-
"by",
|
|
1526
|
-
"is",
|
|
1527
|
-
"are",
|
|
1528
|
-
"was",
|
|
1529
|
-
"were",
|
|
1530
|
-
"be",
|
|
1531
|
-
"been",
|
|
1532
|
-
"being",
|
|
1533
|
-
"have",
|
|
1534
|
-
"has",
|
|
1535
|
-
"had",
|
|
1536
|
-
"do",
|
|
1537
|
-
"does",
|
|
1538
|
-
"did",
|
|
1539
|
-
"will",
|
|
1540
|
-
"would",
|
|
1541
|
-
"could",
|
|
1542
|
-
"should"
|
|
1543
|
-
]);
|
|
1544
|
-
var ERROR_PREFIXES = [
|
|
1545
|
-
"error:",
|
|
1546
|
-
"err:",
|
|
1547
|
-
"vs code command failed",
|
|
1548
|
-
"exception",
|
|
1549
|
-
"traceback",
|
|
1550
|
-
"no response file was generated",
|
|
1551
|
-
"timed out",
|
|
1552
|
-
"cli not found"
|
|
1553
|
-
];
|
|
1554
|
-
function extractAspects(expectedResponse) {
|
|
1555
|
-
const lines = expectedResponse.split(/\r?\n/).map((line) => line.trim());
|
|
1556
|
-
const aspects = [];
|
|
1557
|
-
for (const line of lines) {
|
|
1558
|
-
if (line.length === 0) {
|
|
1559
|
-
continue;
|
|
1560
|
-
}
|
|
1561
|
-
const bulletMatch = /^([-*•]|[0-9]+\.)\s*(.+)$/.exec(line);
|
|
1562
|
-
if (bulletMatch) {
|
|
1563
|
-
const normalized = normalizeAspect(bulletMatch[2]);
|
|
1564
|
-
if (normalized.length > 0) {
|
|
1565
|
-
aspects.push(normalized);
|
|
1566
|
-
}
|
|
1567
|
-
continue;
|
|
1568
|
-
}
|
|
1569
|
-
const lowered = line.toLowerCase();
|
|
1570
|
-
if (Array.from(ACTION_WORDS).some((word) => lowered.startsWith(word))) {
|
|
1571
|
-
const normalized = normalizeAspect(line);
|
|
1572
|
-
if (normalized.length > 0) {
|
|
1573
|
-
aspects.push(normalized);
|
|
1574
|
-
}
|
|
1575
|
-
}
|
|
1576
|
-
}
|
|
1577
|
-
return aspects;
|
|
1578
|
-
}
|
|
1579
|
-
function calculateHits(candidateResponse, expectedAspects) {
|
|
1580
|
-
const { normalizedText, words } = normalizeCandidate(candidateResponse);
|
|
1581
|
-
const hits = [];
|
|
1582
|
-
for (const aspect of expectedAspects) {
|
|
1583
|
-
if (matchesAspect(aspect, normalizedText, words)) {
|
|
1584
|
-
hits.push(aspect);
|
|
1585
|
-
}
|
|
1586
|
-
}
|
|
1587
|
-
return hits;
|
|
1588
|
-
}
|
|
1589
|
-
function calculateMisses(candidateResponse, expectedAspects, resolvedHits) {
|
|
1590
|
-
const hits = new Set(resolvedHits ?? calculateHits(candidateResponse, expectedAspects));
|
|
1591
|
-
return expectedAspects.filter((aspect) => !hits.has(aspect));
|
|
1592
|
-
}
|
|
1593
|
-
function scoreCandidateResponse(candidateResponse, expectedAspects) {
|
|
1594
|
-
if (expectedAspects.length === 0) {
|
|
1595
|
-
if (isErrorLike(candidateResponse)) {
|
|
1596
|
-
return {
|
|
1597
|
-
score: 0,
|
|
1598
|
-
hits: [],
|
|
1599
|
-
misses: ["Model produced an error instead of an answer."],
|
|
1600
|
-
hitCount: 0,
|
|
1601
|
-
totalAspects: 0,
|
|
1602
|
-
rawAspects: []
|
|
1603
|
-
};
|
|
1604
|
-
}
|
|
1605
|
-
return {
|
|
1606
|
-
score: 1,
|
|
1607
|
-
hits: [],
|
|
1608
|
-
misses: [],
|
|
1609
|
-
hitCount: 0,
|
|
1610
|
-
totalAspects: 0,
|
|
1611
|
-
rawAspects: []
|
|
1612
|
-
};
|
|
1613
|
-
}
|
|
1614
|
-
const hits = calculateHits(candidateResponse, expectedAspects);
|
|
1615
|
-
const misses = expectedAspects.filter((aspect) => !hits.includes(aspect));
|
|
1616
|
-
const score = expectedAspects.length > 0 ? hits.length / expectedAspects.length : 0;
|
|
1617
|
-
return {
|
|
1618
|
-
score,
|
|
1619
|
-
hits,
|
|
1620
|
-
misses,
|
|
1621
|
-
hitCount: hits.length,
|
|
1622
|
-
totalAspects: expectedAspects.length,
|
|
1623
|
-
rawAspects: expectedAspects
|
|
1624
|
-
};
|
|
1625
|
-
}
|
|
1626
|
-
function isErrorLike(text) {
|
|
1627
|
-
if (!text) {
|
|
1628
|
-
return false;
|
|
1629
|
-
}
|
|
1630
|
-
const lowered = text.trim().toLowerCase();
|
|
1631
|
-
return ERROR_PREFIXES.some((prefix) => lowered.startsWith(prefix));
|
|
1632
|
-
}
|
|
1633
|
-
function normalizeAspect(aspect) {
|
|
1634
|
-
const sanitized = aspect.toLowerCase().replace(/[^\w\s]/g, " ").replace(/\s+/g, " ").trim();
|
|
1635
|
-
return sanitized;
|
|
1636
|
-
}
|
|
1637
|
-
function normalizeCandidate(candidate) {
|
|
1638
|
-
const lowered = candidate.toLowerCase();
|
|
1639
|
-
const normalizedText = lowered.replace(/[^\w\s]/g, " ");
|
|
1640
|
-
const words = new Set(normalizedText.split(/\s+/).filter((word) => word.length > 0));
|
|
1641
|
-
return { normalizedText, words };
|
|
1642
|
-
}
|
|
1643
|
-
function matchesAspect(aspect, candidateNormalized, candidateWords) {
|
|
1644
|
-
const keyTerms = extractKeyTerms(aspect);
|
|
1645
|
-
if (keyTerms.length === 0) {
|
|
1646
|
-
return false;
|
|
1647
|
-
}
|
|
1648
|
-
const matches = keyTerms.filter((term) => candidateWords.has(term)).length;
|
|
1649
|
-
const ratio = matches / keyTerms.length;
|
|
1650
|
-
if (ratio >= KEY_TERM_MATCH_THRESHOLD) {
|
|
1651
|
-
return true;
|
|
1652
|
-
}
|
|
1653
|
-
const aspectWords = aspect.split(" ");
|
|
1654
|
-
if (aspectWords.length >= 2) {
|
|
1655
|
-
for (let index = 0; index < aspectWords.length - 1; index += 1) {
|
|
1656
|
-
const phrase = `${aspectWords[index]} ${aspectWords[index + 1]}`;
|
|
1657
|
-
if (candidateNormalized.includes(phrase)) {
|
|
1658
|
-
return true;
|
|
1659
|
-
}
|
|
1660
|
-
}
|
|
1661
|
-
}
|
|
1662
|
-
return false;
|
|
1663
|
-
}
|
|
1664
|
-
function extractKeyTerms(aspect, maxTerms = 5) {
|
|
1665
|
-
const terms = [];
|
|
1666
|
-
const words = aspect.split(" ");
|
|
1667
|
-
for (const word of words) {
|
|
1668
|
-
if (word.length <= 2) {
|
|
1669
|
-
continue;
|
|
1670
|
-
}
|
|
1671
|
-
if (STOP_WORDS.has(word)) {
|
|
1672
|
-
continue;
|
|
1673
|
-
}
|
|
1674
|
-
terms.push(word);
|
|
1675
|
-
if (terms.length >= maxTerms) {
|
|
1676
|
-
break;
|
|
1677
|
-
}
|
|
1678
|
-
}
|
|
1679
|
-
return terms;
|
|
1680
|
-
}
|
|
1681
|
-
|
|
1682
|
-
// src/evaluation/grading.ts
|
|
2552
|
+
// src/evaluation/evaluators.ts
|
|
1683
2553
|
var import_node_crypto = require("crypto");
|
|
1684
|
-
var
|
|
1685
|
-
kind = "heuristic";
|
|
1686
|
-
grade(context) {
|
|
1687
|
-
const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
|
|
1688
|
-
const result = scoreCandidateResponse(context.candidate, expectedAspects);
|
|
1689
|
-
const misses = [...result.misses];
|
|
1690
|
-
if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
|
|
1691
|
-
const firstLine = context.candidate.split(/\r?\n/)[0]?.trim();
|
|
1692
|
-
if (firstLine && !misses.includes(firstLine)) {
|
|
1693
|
-
misses.unshift(firstLine);
|
|
1694
|
-
}
|
|
1695
|
-
}
|
|
1696
|
-
return {
|
|
1697
|
-
score: result.score,
|
|
1698
|
-
hits: result.hits,
|
|
1699
|
-
misses,
|
|
1700
|
-
expectedAspectCount: result.totalAspects,
|
|
1701
|
-
rawAspects: result.rawAspects
|
|
1702
|
-
};
|
|
1703
|
-
}
|
|
1704
|
-
};
|
|
1705
|
-
var QualityGrader = class {
|
|
2554
|
+
var LlmJudgeEvaluator = class {
|
|
1706
2555
|
kind = "llm_judge";
|
|
1707
2556
|
resolveJudgeProvider;
|
|
1708
2557
|
maxOutputTokens;
|
|
1709
2558
|
temperature;
|
|
2559
|
+
customPrompt;
|
|
1710
2560
|
constructor(options) {
|
|
1711
2561
|
this.resolveJudgeProvider = options.resolveJudgeProvider;
|
|
1712
2562
|
this.maxOutputTokens = options.maxOutputTokens;
|
|
1713
2563
|
this.temperature = options.temperature;
|
|
2564
|
+
this.customPrompt = options.customPrompt;
|
|
1714
2565
|
}
|
|
1715
|
-
async
|
|
2566
|
+
async evaluate(context) {
|
|
1716
2567
|
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
1717
2568
|
if (!judgeProvider) {
|
|
1718
2569
|
throw new Error("No judge provider available for LLM grading");
|
|
1719
2570
|
}
|
|
1720
2571
|
const prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
2572
|
+
const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
|
|
1721
2573
|
const metadata = {
|
|
1722
|
-
systemPrompt:
|
|
2574
|
+
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2575
|
+
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
1723
2576
|
};
|
|
1724
2577
|
const response = await judgeProvider.invoke({
|
|
1725
2578
|
prompt,
|
|
@@ -1734,12 +2587,13 @@ var QualityGrader = class {
|
|
|
1734
2587
|
const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
1735
2588
|
const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
1736
2589
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
1737
|
-
const
|
|
2590
|
+
const evaluatorRawRequest = {
|
|
1738
2591
|
id: (0, import_node_crypto.randomUUID)(),
|
|
1739
2592
|
provider: judgeProvider.id,
|
|
1740
2593
|
prompt,
|
|
1741
|
-
|
|
1742
|
-
|
|
2594
|
+
target: context.target.name,
|
|
2595
|
+
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2596
|
+
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
1743
2597
|
};
|
|
1744
2598
|
return {
|
|
1745
2599
|
score,
|
|
@@ -1747,7 +2601,7 @@ var QualityGrader = class {
|
|
|
1747
2601
|
misses,
|
|
1748
2602
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
1749
2603
|
reasoning,
|
|
1750
|
-
|
|
2604
|
+
evaluatorRawRequest
|
|
1751
2605
|
};
|
|
1752
2606
|
}
|
|
1753
2607
|
};
|
|
@@ -1865,11 +2719,117 @@ function extractJsonBlob(text) {
|
|
|
1865
2719
|
function isNonEmptyString(value) {
|
|
1866
2720
|
return typeof value === "string" && value.trim().length > 0;
|
|
1867
2721
|
}
|
|
2722
|
+
var CodeEvaluator = class {
|
|
2723
|
+
kind = "code";
|
|
2724
|
+
script;
|
|
2725
|
+
cwd;
|
|
2726
|
+
agentTimeoutMs;
|
|
2727
|
+
constructor(options) {
|
|
2728
|
+
this.script = options.script;
|
|
2729
|
+
this.cwd = options.cwd;
|
|
2730
|
+
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
2731
|
+
}
|
|
2732
|
+
async evaluate(context) {
|
|
2733
|
+
const inputPayload = JSON.stringify(
|
|
2734
|
+
{
|
|
2735
|
+
task: context.evalCase.task,
|
|
2736
|
+
outcome: context.evalCase.outcome,
|
|
2737
|
+
expected: context.evalCase.expected_assistant_raw,
|
|
2738
|
+
output: context.candidate,
|
|
2739
|
+
system_message: context.promptInputs.systemMessage ?? "",
|
|
2740
|
+
guideline_paths: context.evalCase.guideline_paths,
|
|
2741
|
+
attachments: context.evalCase.file_paths,
|
|
2742
|
+
user_segments: context.evalCase.user_segments
|
|
2743
|
+
},
|
|
2744
|
+
null,
|
|
2745
|
+
2
|
|
2746
|
+
);
|
|
2747
|
+
try {
|
|
2748
|
+
const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
|
|
2749
|
+
const parsed = parseJsonSafe(stdout);
|
|
2750
|
+
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
2751
|
+
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
2752
|
+
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
2753
|
+
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
2754
|
+
return {
|
|
2755
|
+
score,
|
|
2756
|
+
hits,
|
|
2757
|
+
misses,
|
|
2758
|
+
expectedAspectCount: hits.length + misses.length || 1,
|
|
2759
|
+
reasoning,
|
|
2760
|
+
evaluatorRawRequest: {
|
|
2761
|
+
script: this.script,
|
|
2762
|
+
...this.cwd ? { cwd: this.cwd } : {}
|
|
2763
|
+
}
|
|
2764
|
+
};
|
|
2765
|
+
} catch (error) {
|
|
2766
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
2767
|
+
return {
|
|
2768
|
+
score: 0,
|
|
2769
|
+
hits: [],
|
|
2770
|
+
misses: [`Code evaluator failed: ${message}`],
|
|
2771
|
+
expectedAspectCount: 1,
|
|
2772
|
+
reasoning: message,
|
|
2773
|
+
evaluatorRawRequest: {
|
|
2774
|
+
script: this.script,
|
|
2775
|
+
...this.cwd ? { cwd: this.cwd } : {},
|
|
2776
|
+
error: message
|
|
2777
|
+
}
|
|
2778
|
+
};
|
|
2779
|
+
}
|
|
2780
|
+
}
|
|
2781
|
+
};
|
|
2782
|
+
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
|
|
2783
|
+
const { spawn: spawn2 } = await import("child_process");
|
|
2784
|
+
return await new Promise((resolve, reject) => {
|
|
2785
|
+
const child = spawn2(scriptPath, {
|
|
2786
|
+
shell: true,
|
|
2787
|
+
cwd
|
|
2788
|
+
});
|
|
2789
|
+
let stdout = "";
|
|
2790
|
+
let stderr = "";
|
|
2791
|
+
const timeout = agentTimeoutMs ? setTimeout(() => {
|
|
2792
|
+
child.kill();
|
|
2793
|
+
reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
|
|
2794
|
+
}, agentTimeoutMs) : void 0;
|
|
2795
|
+
child.stdout?.on("data", (data) => {
|
|
2796
|
+
stdout += data.toString();
|
|
2797
|
+
});
|
|
2798
|
+
child.stderr?.on("data", (data) => {
|
|
2799
|
+
stderr += data.toString();
|
|
2800
|
+
});
|
|
2801
|
+
child.on("error", (error) => {
|
|
2802
|
+
if (timeout !== void 0) {
|
|
2803
|
+
clearTimeout(timeout);
|
|
2804
|
+
}
|
|
2805
|
+
reject(error);
|
|
2806
|
+
});
|
|
2807
|
+
child.on("exit", (code) => {
|
|
2808
|
+
if (timeout !== void 0) {
|
|
2809
|
+
clearTimeout(timeout);
|
|
2810
|
+
}
|
|
2811
|
+
if (code && code !== 0 && stderr.length > 0) {
|
|
2812
|
+
reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
|
|
2813
|
+
return;
|
|
2814
|
+
}
|
|
2815
|
+
resolve(stdout.trim());
|
|
2816
|
+
});
|
|
2817
|
+
child.stdin?.write(input);
|
|
2818
|
+
child.stdin?.end();
|
|
2819
|
+
});
|
|
2820
|
+
}
|
|
2821
|
+
function parseJsonSafe(payload) {
|
|
2822
|
+
try {
|
|
2823
|
+
return JSON.parse(payload);
|
|
2824
|
+
} catch {
|
|
2825
|
+
return void 0;
|
|
2826
|
+
}
|
|
2827
|
+
}
|
|
1868
2828
|
|
|
1869
2829
|
// src/evaluation/orchestrator.ts
|
|
1870
2830
|
var import_node_crypto2 = require("crypto");
|
|
1871
|
-
var
|
|
1872
|
-
var
|
|
2831
|
+
var import_promises6 = require("fs/promises");
|
|
2832
|
+
var import_node_path8 = __toESM(require("path"), 1);
|
|
1873
2833
|
|
|
1874
2834
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
1875
2835
|
var Node = class {
|
|
@@ -2016,7 +2976,7 @@ async function runEvaluation(options) {
|
|
|
2016
2976
|
targets,
|
|
2017
2977
|
env,
|
|
2018
2978
|
providerFactory,
|
|
2019
|
-
|
|
2979
|
+
evaluators,
|
|
2020
2980
|
maxRetries,
|
|
2021
2981
|
agentTimeoutMs,
|
|
2022
2982
|
promptDumpDir,
|
|
@@ -2075,7 +3035,7 @@ async function runEvaluation(options) {
|
|
|
2075
3035
|
}
|
|
2076
3036
|
return getOrCreateProvider(resolvedJudge);
|
|
2077
3037
|
};
|
|
2078
|
-
const
|
|
3038
|
+
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
2079
3039
|
const primaryProvider = getOrCreateProvider(target);
|
|
2080
3040
|
const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
|
|
2081
3041
|
if (target.providerBatching && !providerSupportsBatch && verbose) {
|
|
@@ -2098,13 +3058,14 @@ async function runEvaluation(options) {
|
|
|
2098
3058
|
evalCases: filteredEvalCases,
|
|
2099
3059
|
provider: primaryProvider,
|
|
2100
3060
|
target,
|
|
2101
|
-
|
|
3061
|
+
evaluatorRegistry,
|
|
2102
3062
|
promptDumpDir,
|
|
2103
3063
|
nowFn: now ?? (() => /* @__PURE__ */ new Date()),
|
|
2104
3064
|
onProgress,
|
|
2105
3065
|
onResult,
|
|
2106
3066
|
verbose,
|
|
2107
|
-
resolveJudgeProvider
|
|
3067
|
+
resolveJudgeProvider,
|
|
3068
|
+
agentTimeoutMs
|
|
2108
3069
|
});
|
|
2109
3070
|
} catch (error) {
|
|
2110
3071
|
if (verbose) {
|
|
@@ -2135,7 +3096,7 @@ async function runEvaluation(options) {
|
|
|
2135
3096
|
evalCase,
|
|
2136
3097
|
provider: primaryProvider,
|
|
2137
3098
|
target,
|
|
2138
|
-
|
|
3099
|
+
evaluators: evaluatorRegistry,
|
|
2139
3100
|
maxRetries,
|
|
2140
3101
|
agentTimeoutMs,
|
|
2141
3102
|
promptDumpDir,
|
|
@@ -2201,12 +3162,13 @@ async function runBatchEvaluation(options) {
|
|
|
2201
3162
|
evalCases,
|
|
2202
3163
|
provider,
|
|
2203
3164
|
target,
|
|
2204
|
-
|
|
3165
|
+
evaluatorRegistry,
|
|
2205
3166
|
promptDumpDir,
|
|
2206
3167
|
nowFn,
|
|
2207
3168
|
onProgress,
|
|
2208
3169
|
onResult,
|
|
2209
|
-
resolveJudgeProvider
|
|
3170
|
+
resolveJudgeProvider,
|
|
3171
|
+
agentTimeoutMs
|
|
2210
3172
|
} = options;
|
|
2211
3173
|
const promptInputsList = [];
|
|
2212
3174
|
for (const evalCase of evalCases) {
|
|
@@ -2222,7 +3184,7 @@ async function runBatchEvaluation(options) {
|
|
|
2222
3184
|
prompt: promptInputs.request,
|
|
2223
3185
|
guidelines: promptInputs.guidelines,
|
|
2224
3186
|
guideline_patterns: evalCase.guideline_patterns,
|
|
2225
|
-
|
|
3187
|
+
inputFiles: evalCase.file_paths,
|
|
2226
3188
|
evalCaseId: evalCase.id,
|
|
2227
3189
|
metadata: {
|
|
2228
3190
|
systemPrompt: promptInputs.systemMessage ?? ""
|
|
@@ -2254,23 +3216,19 @@ async function runBatchEvaluation(options) {
|
|
|
2254
3216
|
const evalCase = evalCases[i];
|
|
2255
3217
|
const promptInputs = promptInputsList[i];
|
|
2256
3218
|
const providerResponse = batchResponse[i];
|
|
2257
|
-
|
|
2258
|
-
const graderKind = evalCase.grader ?? "heuristic";
|
|
2259
|
-
const activeGrader = graderRegistry[graderKind] ?? graderRegistry.heuristic;
|
|
2260
|
-
if (!activeGrader) {
|
|
2261
|
-
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
2262
|
-
}
|
|
2263
|
-
let grade;
|
|
3219
|
+
let result;
|
|
2264
3220
|
try {
|
|
2265
|
-
|
|
3221
|
+
result = await evaluateCandidate({
|
|
2266
3222
|
evalCase,
|
|
2267
3223
|
candidate: providerResponse.text ?? "",
|
|
2268
3224
|
target,
|
|
2269
3225
|
provider,
|
|
2270
|
-
|
|
3226
|
+
evaluators: evaluatorRegistry,
|
|
2271
3227
|
promptInputs,
|
|
2272
|
-
|
|
2273
|
-
|
|
3228
|
+
nowFn,
|
|
3229
|
+
attempt: 0,
|
|
3230
|
+
judgeProvider: await resolveJudgeProvider(target),
|
|
3231
|
+
agentTimeoutMs
|
|
2274
3232
|
});
|
|
2275
3233
|
} catch (error) {
|
|
2276
3234
|
const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
@@ -2289,28 +3247,6 @@ async function runBatchEvaluation(options) {
|
|
|
2289
3247
|
}
|
|
2290
3248
|
continue;
|
|
2291
3249
|
}
|
|
2292
|
-
const completedAt = nowFn();
|
|
2293
|
-
const rawRequest = {
|
|
2294
|
-
request: promptInputs.request,
|
|
2295
|
-
guidelines: promptInputs.guidelines,
|
|
2296
|
-
guideline_paths: evalCase.guideline_paths,
|
|
2297
|
-
system_message: promptInputs.systemMessage ?? ""
|
|
2298
|
-
};
|
|
2299
|
-
const result = {
|
|
2300
|
-
eval_id: evalCase.id,
|
|
2301
|
-
conversation_id: evalCase.conversation_id,
|
|
2302
|
-
score: grade.score,
|
|
2303
|
-
hits: grade.hits,
|
|
2304
|
-
misses: grade.misses,
|
|
2305
|
-
model_answer: providerResponse.text ?? "",
|
|
2306
|
-
expected_aspect_count: grade.expectedAspectCount,
|
|
2307
|
-
target: target.name,
|
|
2308
|
-
timestamp: completedAt.toISOString(),
|
|
2309
|
-
reasoning: grade.reasoning,
|
|
2310
|
-
raw_aspects: grade.rawAspects,
|
|
2311
|
-
raw_request: rawRequest,
|
|
2312
|
-
grader_raw_request: grade.graderRawRequest
|
|
2313
|
-
};
|
|
2314
3250
|
results.push(result);
|
|
2315
3251
|
if (onResult) {
|
|
2316
3252
|
await onResult(result);
|
|
@@ -2332,7 +3268,7 @@ async function runEvalCase(options) {
|
|
|
2332
3268
|
evalCase,
|
|
2333
3269
|
provider,
|
|
2334
3270
|
target,
|
|
2335
|
-
|
|
3271
|
+
evaluators,
|
|
2336
3272
|
now,
|
|
2337
3273
|
maxRetries,
|
|
2338
3274
|
agentTimeoutMs,
|
|
@@ -2387,27 +3323,49 @@ async function runEvalCase(options) {
|
|
|
2387
3323
|
if (cacheKey && cache && !cachedResponse) {
|
|
2388
3324
|
await cache.set(cacheKey, providerResponse);
|
|
2389
3325
|
}
|
|
2390
|
-
const graderKind = evalCase.grader ?? "heuristic";
|
|
2391
|
-
const activeGrader = graders[graderKind] ?? graders.heuristic;
|
|
2392
|
-
if (!activeGrader) {
|
|
2393
|
-
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
2394
|
-
}
|
|
2395
|
-
let grade;
|
|
2396
3326
|
try {
|
|
2397
|
-
|
|
2398
|
-
grade = await activeGrader.grade({
|
|
3327
|
+
return await evaluateCandidate({
|
|
2399
3328
|
evalCase,
|
|
2400
3329
|
candidate: providerResponse.text ?? "",
|
|
2401
3330
|
target,
|
|
2402
3331
|
provider,
|
|
2403
|
-
|
|
3332
|
+
evaluators,
|
|
2404
3333
|
promptInputs,
|
|
2405
|
-
|
|
2406
|
-
|
|
3334
|
+
nowFn,
|
|
3335
|
+
attempt,
|
|
3336
|
+
judgeProvider,
|
|
3337
|
+
agentTimeoutMs
|
|
2407
3338
|
});
|
|
2408
3339
|
} catch (error) {
|
|
2409
3340
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
2410
3341
|
}
|
|
3342
|
+
}
|
|
3343
|
+
async function evaluateCandidate(options) {
|
|
3344
|
+
const {
|
|
3345
|
+
evalCase,
|
|
3346
|
+
candidate,
|
|
3347
|
+
target,
|
|
3348
|
+
provider,
|
|
3349
|
+
evaluators,
|
|
3350
|
+
promptInputs,
|
|
3351
|
+
nowFn,
|
|
3352
|
+
attempt,
|
|
3353
|
+
judgeProvider,
|
|
3354
|
+
agentTimeoutMs
|
|
3355
|
+
} = options;
|
|
3356
|
+
const gradeTimestamp = nowFn();
|
|
3357
|
+
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
3358
|
+
evalCase,
|
|
3359
|
+
candidate,
|
|
3360
|
+
target,
|
|
3361
|
+
provider,
|
|
3362
|
+
evaluators,
|
|
3363
|
+
attempt,
|
|
3364
|
+
promptInputs,
|
|
3365
|
+
now: gradeTimestamp,
|
|
3366
|
+
judgeProvider,
|
|
3367
|
+
agentTimeoutMs
|
|
3368
|
+
});
|
|
2411
3369
|
const completedAt = nowFn();
|
|
2412
3370
|
const rawRequest = {
|
|
2413
3371
|
request: promptInputs.request,
|
|
@@ -2418,28 +3376,200 @@ async function runEvalCase(options) {
|
|
|
2418
3376
|
return {
|
|
2419
3377
|
eval_id: evalCase.id,
|
|
2420
3378
|
conversation_id: evalCase.conversation_id,
|
|
2421
|
-
score:
|
|
2422
|
-
hits:
|
|
2423
|
-
misses:
|
|
2424
|
-
model_answer:
|
|
2425
|
-
expected_aspect_count:
|
|
3379
|
+
score: score.score,
|
|
3380
|
+
hits: score.hits,
|
|
3381
|
+
misses: score.misses,
|
|
3382
|
+
model_answer: candidate,
|
|
3383
|
+
expected_aspect_count: score.expectedAspectCount,
|
|
2426
3384
|
target: target.name,
|
|
2427
3385
|
timestamp: completedAt.toISOString(),
|
|
2428
|
-
reasoning:
|
|
2429
|
-
raw_aspects:
|
|
3386
|
+
reasoning: score.reasoning,
|
|
3387
|
+
raw_aspects: score.rawAspects,
|
|
2430
3388
|
raw_request: rawRequest,
|
|
2431
|
-
|
|
3389
|
+
evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
3390
|
+
evaluator_results: evaluatorResults
|
|
2432
3391
|
};
|
|
2433
3392
|
}
|
|
3393
|
+
async function runEvaluatorsForCase(options) {
|
|
3394
|
+
const { evalCase, candidate, target, provider, evaluators, attempt, promptInputs, now, judgeProvider, agentTimeoutMs } = options;
|
|
3395
|
+
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
3396
|
+
return runEvaluatorList({
|
|
3397
|
+
evalCase,
|
|
3398
|
+
evaluators: evalCase.evaluators,
|
|
3399
|
+
candidate,
|
|
3400
|
+
target,
|
|
3401
|
+
provider,
|
|
3402
|
+
evaluatorRegistry: evaluators,
|
|
3403
|
+
attempt,
|
|
3404
|
+
promptInputs,
|
|
3405
|
+
now,
|
|
3406
|
+
judgeProvider,
|
|
3407
|
+
agentTimeoutMs
|
|
3408
|
+
});
|
|
3409
|
+
}
|
|
3410
|
+
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
3411
|
+
const activeEvaluator = evaluators[evaluatorKind] ?? evaluators.llm_judge;
|
|
3412
|
+
if (!activeEvaluator) {
|
|
3413
|
+
throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
|
|
3414
|
+
}
|
|
3415
|
+
const score = await activeEvaluator.evaluate({
|
|
3416
|
+
evalCase,
|
|
3417
|
+
candidate,
|
|
3418
|
+
target,
|
|
3419
|
+
provider,
|
|
3420
|
+
attempt,
|
|
3421
|
+
promptInputs,
|
|
3422
|
+
now,
|
|
3423
|
+
judgeProvider
|
|
3424
|
+
});
|
|
3425
|
+
return { score };
|
|
3426
|
+
}
|
|
3427
|
+
async function runEvaluatorList(options) {
|
|
3428
|
+
const {
|
|
3429
|
+
evalCase,
|
|
3430
|
+
evaluators,
|
|
3431
|
+
candidate,
|
|
3432
|
+
target,
|
|
3433
|
+
provider,
|
|
3434
|
+
evaluatorRegistry,
|
|
3435
|
+
attempt,
|
|
3436
|
+
promptInputs,
|
|
3437
|
+
now,
|
|
3438
|
+
judgeProvider,
|
|
3439
|
+
agentTimeoutMs
|
|
3440
|
+
} = options;
|
|
3441
|
+
const scored = [];
|
|
3442
|
+
const evaluatorResults = [];
|
|
3443
|
+
for (const evaluator of evaluators ?? []) {
|
|
3444
|
+
try {
|
|
3445
|
+
if (evaluator.type === "llm_judge") {
|
|
3446
|
+
const score2 = await runLlmJudgeEvaluator({
|
|
3447
|
+
config: evaluator,
|
|
3448
|
+
evalCase,
|
|
3449
|
+
candidate,
|
|
3450
|
+
target,
|
|
3451
|
+
provider,
|
|
3452
|
+
evaluatorRegistry,
|
|
3453
|
+
attempt,
|
|
3454
|
+
promptInputs,
|
|
3455
|
+
now,
|
|
3456
|
+
judgeProvider
|
|
3457
|
+
});
|
|
3458
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
3459
|
+
evaluatorResults.push({
|
|
3460
|
+
name: evaluator.name,
|
|
3461
|
+
type: evaluator.type,
|
|
3462
|
+
score: score2.score,
|
|
3463
|
+
hits: score2.hits,
|
|
3464
|
+
misses: score2.misses,
|
|
3465
|
+
reasoning: score2.reasoning,
|
|
3466
|
+
evaluator_raw_request: score2.evaluatorRawRequest
|
|
3467
|
+
});
|
|
3468
|
+
continue;
|
|
3469
|
+
}
|
|
3470
|
+
if (evaluator.type === "code") {
|
|
3471
|
+
const codeEvaluator = new CodeEvaluator({
|
|
3472
|
+
script: evaluator.script,
|
|
3473
|
+
cwd: evaluator.resolvedCwd ?? evaluator.cwd,
|
|
3474
|
+
agentTimeoutMs
|
|
3475
|
+
});
|
|
3476
|
+
const score2 = await codeEvaluator.evaluate({
|
|
3477
|
+
evalCase,
|
|
3478
|
+
candidate,
|
|
3479
|
+
target,
|
|
3480
|
+
provider,
|
|
3481
|
+
attempt,
|
|
3482
|
+
promptInputs,
|
|
3483
|
+
now
|
|
3484
|
+
});
|
|
3485
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
3486
|
+
evaluatorResults.push({
|
|
3487
|
+
name: evaluator.name,
|
|
3488
|
+
type: evaluator.type,
|
|
3489
|
+
score: score2.score,
|
|
3490
|
+
hits: score2.hits,
|
|
3491
|
+
misses: score2.misses,
|
|
3492
|
+
reasoning: score2.reasoning,
|
|
3493
|
+
evaluator_raw_request: score2.evaluatorRawRequest
|
|
3494
|
+
});
|
|
3495
|
+
continue;
|
|
3496
|
+
}
|
|
3497
|
+
} catch (error) {
|
|
3498
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3499
|
+
const fallbackScore = {
|
|
3500
|
+
score: 0,
|
|
3501
|
+
hits: [],
|
|
3502
|
+
misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
|
|
3503
|
+
expectedAspectCount: 1,
|
|
3504
|
+
reasoning: message
|
|
3505
|
+
};
|
|
3506
|
+
scored.push({ score: fallbackScore, name: evaluator.name ?? "unknown", type: evaluator.type ?? "unknown" });
|
|
3507
|
+
evaluatorResults.push({
|
|
3508
|
+
name: evaluator.name ?? "unknown",
|
|
3509
|
+
type: evaluator.type ?? "unknown",
|
|
3510
|
+
score: 0,
|
|
3511
|
+
hits: [],
|
|
3512
|
+
misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
|
|
3513
|
+
reasoning: message
|
|
3514
|
+
});
|
|
3515
|
+
}
|
|
3516
|
+
}
|
|
3517
|
+
const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
|
|
3518
|
+
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
3519
|
+
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
3520
|
+
const expectedAspectCount = scored.reduce((total, entry) => total + (entry.score.expectedAspectCount ?? 0), 0);
|
|
3521
|
+
const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
|
|
3522
|
+
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
|
|
3523
|
+
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
3524
|
+
const score = {
|
|
3525
|
+
score: aggregateScore,
|
|
3526
|
+
hits,
|
|
3527
|
+
misses,
|
|
3528
|
+
expectedAspectCount,
|
|
3529
|
+
reasoning,
|
|
3530
|
+
rawAspects: rawAspects.length > 0 ? rawAspects : void 0
|
|
3531
|
+
};
|
|
3532
|
+
return { score, evaluatorResults };
|
|
3533
|
+
}
|
|
3534
|
+
async function runLlmJudgeEvaluator(options) {
|
|
3535
|
+
const { config, evalCase, candidate, target, provider, evaluatorRegistry, attempt, promptInputs, now, judgeProvider } = options;
|
|
3536
|
+
const customPrompt = await resolveCustomPrompt(config);
|
|
3537
|
+
return evaluatorRegistry.llm_judge.evaluate({
|
|
3538
|
+
evalCase,
|
|
3539
|
+
candidate,
|
|
3540
|
+
target,
|
|
3541
|
+
provider,
|
|
3542
|
+
attempt,
|
|
3543
|
+
promptInputs,
|
|
3544
|
+
now,
|
|
3545
|
+
judgeProvider,
|
|
3546
|
+
systemPrompt: customPrompt,
|
|
3547
|
+
evaluator: config,
|
|
3548
|
+
judgeModel: config.model
|
|
3549
|
+
});
|
|
3550
|
+
}
|
|
3551
|
+
async function resolveCustomPrompt(config) {
|
|
3552
|
+
if (config.promptPath) {
|
|
3553
|
+
try {
|
|
3554
|
+
return await (0, import_promises6.readFile)(config.promptPath, "utf8");
|
|
3555
|
+
} catch (error) {
|
|
3556
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3557
|
+
console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
|
|
3558
|
+
}
|
|
3559
|
+
}
|
|
3560
|
+
return config.prompt;
|
|
3561
|
+
}
|
|
3562
|
+
function isNonEmptyString2(value) {
|
|
3563
|
+
return typeof value === "string" && value.trim().length > 0;
|
|
3564
|
+
}
|
|
2434
3565
|
function filterEvalCases(evalCases, evalId) {
|
|
2435
3566
|
if (!evalId) {
|
|
2436
3567
|
return evalCases;
|
|
2437
3568
|
}
|
|
2438
3569
|
return evalCases.filter((evalCase) => evalCase.id === evalId);
|
|
2439
3570
|
}
|
|
2440
|
-
function
|
|
2441
|
-
const
|
|
2442
|
-
const llmJudge = overrides?.llm_judge ?? new QualityGrader({
|
|
3571
|
+
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
3572
|
+
const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
|
|
2443
3573
|
resolveJudgeProvider: async (context) => {
|
|
2444
3574
|
if (context.judgeProvider) {
|
|
2445
3575
|
return context.judgeProvider;
|
|
@@ -2449,22 +3579,21 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
|
|
|
2449
3579
|
});
|
|
2450
3580
|
return {
|
|
2451
3581
|
...overrides,
|
|
2452
|
-
heuristic,
|
|
2453
3582
|
llm_judge: llmJudge
|
|
2454
3583
|
};
|
|
2455
3584
|
}
|
|
2456
3585
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
2457
3586
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
2458
3587
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
2459
|
-
const filePath =
|
|
2460
|
-
await (0,
|
|
3588
|
+
const filePath = import_node_path8.default.resolve(directory, filename);
|
|
3589
|
+
await (0, import_promises6.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
|
|
2461
3590
|
const payload = {
|
|
2462
3591
|
eval_id: evalCase.id,
|
|
2463
3592
|
request: promptInputs.request,
|
|
2464
3593
|
guidelines: promptInputs.guidelines,
|
|
2465
3594
|
guideline_paths: evalCase.guideline_paths
|
|
2466
3595
|
};
|
|
2467
|
-
await (0,
|
|
3596
|
+
await (0, import_promises6.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
2468
3597
|
}
|
|
2469
3598
|
function sanitizeFilename(value) {
|
|
2470
3599
|
if (!value) {
|
|
@@ -2474,7 +3603,7 @@ function sanitizeFilename(value) {
|
|
|
2474
3603
|
return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
|
|
2475
3604
|
}
|
|
2476
3605
|
async function invokeProvider(provider, options) {
|
|
2477
|
-
const { evalCase,
|
|
3606
|
+
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
2478
3607
|
const controller = new AbortController();
|
|
2479
3608
|
const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
|
|
2480
3609
|
if (signal) {
|
|
@@ -2485,7 +3614,7 @@ async function invokeProvider(provider, options) {
|
|
|
2485
3614
|
prompt: promptInputs.request,
|
|
2486
3615
|
guidelines: promptInputs.guidelines,
|
|
2487
3616
|
guideline_patterns: evalCase.guideline_patterns,
|
|
2488
|
-
|
|
3617
|
+
inputFiles: evalCase.file_paths,
|
|
2489
3618
|
evalCaseId: evalCase.id,
|
|
2490
3619
|
attempt,
|
|
2491
3620
|
metadata: {
|
|
@@ -2554,25 +3683,20 @@ function createAgentKernel() {
|
|
|
2554
3683
|
}
|
|
2555
3684
|
// Annotate the CommonJS export names for ESM import in node:
|
|
2556
3685
|
0 && (module.exports = {
|
|
2557
|
-
|
|
2558
|
-
|
|
2559
|
-
QualityGrader,
|
|
3686
|
+
CodeEvaluator,
|
|
3687
|
+
LlmJudgeEvaluator,
|
|
2560
3688
|
TEST_MESSAGE_ROLES,
|
|
2561
3689
|
buildDirectoryChain,
|
|
2562
3690
|
buildPromptInputs,
|
|
2563
3691
|
buildSearchRoots,
|
|
2564
|
-
calculateHits,
|
|
2565
|
-
calculateMisses,
|
|
2566
3692
|
createAgentKernel,
|
|
2567
3693
|
createProvider,
|
|
2568
3694
|
ensureVSCodeSubagents,
|
|
2569
|
-
extractAspects,
|
|
2570
3695
|
extractCodeBlocks,
|
|
2571
3696
|
fileExists,
|
|
2572
3697
|
findGitRoot,
|
|
2573
3698
|
getHitCount,
|
|
2574
|
-
|
|
2575
|
-
isGraderKind,
|
|
3699
|
+
isEvaluatorKind,
|
|
2576
3700
|
isGuidelineFile,
|
|
2577
3701
|
isJsonObject,
|
|
2578
3702
|
isJsonValue,
|
|
@@ -2585,7 +3709,6 @@ function createAgentKernel() {
|
|
|
2585
3709
|
resolveFileReference,
|
|
2586
3710
|
resolveTargetDefinition,
|
|
2587
3711
|
runEvalCase,
|
|
2588
|
-
runEvaluation
|
|
2589
|
-
scoreCandidateResponse
|
|
3712
|
+
runEvaluation
|
|
2590
3713
|
});
|
|
2591
3714
|
//# sourceMappingURL=index.cjs.map
|