@agentv/core 0.2.11 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-P4GOYWYH.js → chunk-NL7K4CAK.js} +5 -1
- package/dist/chunk-NL7K4CAK.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +186 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +183 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1482 -359
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +107 -63
- package/dist/index.d.ts +107 -63
- package/dist/index.js +1474 -350
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-P4GOYWYH.js.map +0 -1
- package/dist/chunk-XXNQA4EW.js +0 -140
- package/dist/chunk-XXNQA4EW.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -5,7 +5,7 @@ import {
|
|
|
5
5
|
fileExists,
|
|
6
6
|
findGitRoot,
|
|
7
7
|
resolveFileReference
|
|
8
|
-
} from "./chunk-
|
|
8
|
+
} from "./chunk-NL7K4CAK.js";
|
|
9
9
|
|
|
10
10
|
// src/evaluation/types.ts
|
|
11
11
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -48,11 +48,10 @@ function isTestMessage(value) {
|
|
|
48
48
|
}
|
|
49
49
|
return candidate.content.every(isJsonObject);
|
|
50
50
|
}
|
|
51
|
-
var
|
|
52
|
-
var
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
return typeof value === "string" && GRADER_KIND_SET.has(value);
|
|
51
|
+
// Evaluator kinds that an eval file may name.
var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
/**
 * Reports whether `value` is one of the supported evaluator kind strings.
 *
 * @param {unknown} value - Candidate taken from parsed eval data.
 * @returns {boolean} True only for a string listed in EVALUATOR_KIND_VALUES.
 */
function isEvaluatorKind(value) {
  if (typeof value !== "string") {
    return false;
  }
  return EVALUATOR_KIND_SET.has(value);
}
|
|
57
56
|
function getHitCount(result) {
|
|
58
57
|
return result.hits.length;
|
|
@@ -160,7 +159,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
160
159
|
if (!Array.isArray(rawTestcases)) {
|
|
161
160
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
162
161
|
}
|
|
163
|
-
const
|
|
162
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
164
163
|
const results = [];
|
|
165
164
|
for (const rawEvalcase of rawTestcases) {
|
|
166
165
|
if (!isJsonObject(rawEvalcase)) {
|
|
@@ -283,7 +282,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
283
282
|
const assistantContent = assistantMessages[0]?.content;
|
|
284
283
|
const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
285
284
|
const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
286
|
-
const
|
|
285
|
+
const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
286
|
+
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
287
287
|
const userFilePaths = [];
|
|
288
288
|
for (const segment of userSegments) {
|
|
289
289
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -306,7 +306,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
306
306
|
file_paths: allFilePaths,
|
|
307
307
|
code_snippets: codeSnippets,
|
|
308
308
|
outcome,
|
|
309
|
-
|
|
309
|
+
evaluator: testCaseEvaluatorKind,
|
|
310
|
+
evaluators
|
|
310
311
|
};
|
|
311
312
|
if (verbose) {
|
|
312
313
|
console.log(`
|
|
@@ -467,14 +468,88 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
|
467
468
|
}
|
|
468
469
|
return parts.join(" ");
|
|
469
470
|
}
|
|
470
|
-
function
|
|
471
|
+
/**
 * Reads the evaluator list declared on a raw eval case and normalizes it.
 *
 * `execution.evaluators` wins over the case-level `evaluators` field when
 * `execution` is an object. Malformed entries are skipped with a warning
 * rather than failing the whole case.
 *
 * @param {object} rawEvalCase - Parsed eval case.
 * @param {string[]} searchRoots - Roots handed to resolveFileReference.
 * @param {string} evalId - Case id used in warning messages.
 * @returns {Promise<object[]|undefined>} Normalized evaluators, or undefined
 *   when none were declared or none survived validation.
 */
async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
  const execution = rawEvalCase.execution;
  let declared;
  if (isJsonObject(execution)) {
    declared = execution.evaluators ?? rawEvalCase.evaluators;
  } else {
    declared = rawEvalCase.evaluators;
  }
  if (declared === void 0) {
    return void 0;
  }
  if (!Array.isArray(declared)) {
    logWarning(`Skipping evaluators for '${evalId}': expected array`);
    return void 0;
  }
  // Extra warning detail: the locations the resolver tried, if it reported any.
  const describeAttempts = (resolution) => {
    if (resolution.attempted.length > 0) {
      return resolution.attempted.map((attempt) => ` Tried: ${attempt}`);
    }
    return void 0;
  };
  const parsed = [];
  for (const entry of declared) {
    if (!isJsonObject(entry)) {
      logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
      continue;
    }
    const name = asString(entry.name);
    const kind = entry.type;
    if (!name || !isEvaluatorKind(kind)) {
      logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
      continue;
    }
    if (kind === "code") {
      const script = asString(entry.script);
      if (!script) {
        logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
        continue;
      }
      const cwd = asString(entry.cwd);
      let resolvedCwd;
      if (cwd) {
        const resolution = await resolveFileReference(cwd, searchRoots);
        if (resolution.resolvedPath) {
          resolvedCwd = path.resolve(resolution.resolvedPath);
        } else {
          logWarning(
            `Code evaluator '${name}' in '${evalId}': cwd not found (${resolution.displayPath})`,
            describeAttempts(resolution)
          );
        }
      }
      parsed.push({ name, type: "code", script, cwd, resolvedCwd });
    } else {
      // Only "llm_judge" remains. `prompt` may be a file reference or inline
      // text; when no file resolves, the inline text is kept as-is.
      const prompt = asString(entry.prompt);
      let promptPath;
      if (prompt) {
        const resolution = await resolveFileReference(prompt, searchRoots);
        if (resolution.resolvedPath) {
          promptPath = path.resolve(resolution.resolvedPath);
        } else {
          logWarning(
            `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolution.displayPath})`,
            describeAttempts(resolution)
          );
        }
      }
      const model = asString(entry.model);
      parsed.push({ name, type: "llm_judge", prompt, promptPath, model });
    }
  }
  return parsed.length > 0 ? parsed : void 0;
}
|
|
545
|
+
/**
 * Narrows a raw `evaluator` field to a known evaluator kind.
 *
 * @param {unknown} candidate - Value read from the eval file.
 * @param {string} contextId - Where the value came from; used in the warning.
 * @returns {string|undefined} The kind when recognized. Otherwise undefined,
 *   with a warning logged for unrecognized strings only.
 */
function coerceEvaluator(candidate, contextId) {
  if (typeof candidate === "string") {
    if (isEvaluatorKind(candidate)) {
      return candidate;
    }
    logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
  }
  return void 0;
}
|
|
480
555
|
function logWarning(message, details) {
|
|
@@ -670,6 +745,790 @@ var GeminiProvider = class {
|
|
|
670
745
|
}
|
|
671
746
|
};
|
|
672
747
|
|
|
748
|
+
// src/evaluation/providers/cli.ts
|
|
749
|
+
import { exec as execWithCallback } from "node:child_process";
|
|
750
|
+
import path2 from "node:path";
|
|
751
|
+
import { promisify } from "node:util";
|
|
752
|
+
var execAsync = promisify(execWithCallback);
// Upper bound on captured stdout/stderr handed to exec (10 MiB).
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
/**
 * Runs `command` through child_process.exec and reports the outcome as a
 * plain result object instead of throwing.
 *
 * @param {string} command - Fully rendered command line.
 * @param {{cwd?: string, env?: object, timeoutMs?: number, signal?: AbortSignal}} options
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number|null,
 *   failed: boolean, timedOut: boolean, signal: string|null}>}
 */
async function defaultCommandRunner(command, options) {
  const execOptions = {
    cwd: options.cwd,
    env: options.env,
    timeout: options.timeoutMs,
    signal: options.signal,
    maxBuffer: DEFAULT_MAX_BUFFER,
    // On Windows the command is executed by PowerShell, not cmd.exe.
    shell: process.platform === "win32" ? "powershell.exe" : void 0
  };
  let outcome;
  try {
    const completed = await execAsync(command, execOptions);
    outcome = {
      stdout: completed.stdout,
      stderr: completed.stderr,
      exitCode: 0,
      failed: false,
      timedOut: false,
      signal: null
    };
  } catch (error) {
    const failure = error;
    // A non-numeric `code` (e.g. an errno string) is reported as "no exit code".
    const numericCode = typeof failure.code === "number" ? failure.code : null;
    outcome = {
      stdout: failure.stdout ?? "",
      stderr: failure.stderr ?? "",
      exitCode: numericCode,
      failed: true,
      timedOut: failure.timedOut === true || failure.killed === true,
      signal: failure.signal ?? null
    };
  }
  return outcome;
}
|
|
785
|
+
/**
 * Provider that answers a request by rendering the configured command
 * template, running it through the injected command runner, and returning the
 * command's stdout as the response text.
 */
var CliProvider = class {
  id;
  kind = "cli";
  targetName;
  supportsBatch = false;
  config;
  runCommand;
  // Memoized healthcheck so later invocations reuse the first probe's outcome.
  healthcheckPromise;
  /**
   * @param {string} targetName - Target name; also used to build `id`.
   * @param {object} config - Provider settings (commandTemplate, cwd, env,
   *   timeoutMs, filesFormat, healthcheck).
   * @param {Function} [runner] - Command runner; injectable for tests.
   */
  constructor(targetName, config, runner = defaultCommandRunner) {
    this.targetName = targetName;
    this.id = `cli:${targetName}`;
    this.config = config;
    this.runCommand = runner;
  }
  /**
   * Renders and runs the command for one request.
   *
   * @param {object} request - Provider request (prompt, guidelines,
   *   inputFiles, evalCaseId, attempt, signal).
   * @returns {Promise<{text: string, raw: object}>} stdout plus run metadata.
   * @throws {Error} When aborted, timed out, the healthcheck fails, or the
   *   command fails / exits non-zero.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("CLI provider request was aborted before execution");
    }
    await this.ensureHealthy(request.signal);
    const templateValues = buildTemplateValues(request, this.config);
    const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
    const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
    const result = await this.runCommand(renderedCommand, {
      cwd: this.config.cwd,
      env,
      timeoutMs: this.config.timeoutMs,
      signal: request.signal
    });
    if (result.failed || (result.exitCode ?? 0) !== 0) {
      // Abort is checked first so a cancelled run is not reported as a timeout.
      if (request.signal?.aborted) {
        throw new Error("CLI provider request was aborted");
      }
      if (result.timedOut) {
        throw new Error(
          `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
        );
      }
      const codeText = result.exitCode !== null ? result.exitCode : "unknown";
      const detail = result.stderr.trim() || result.stdout.trim();
      const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
      throw new Error(message);
    }
    return {
      text: result.stdout,
      raw: {
        command: renderedCommand,
        stderr: result.stderr,
        exitCode: result.exitCode ?? 0,
        cwd: this.config.cwd
      }
    };
  }
  /**
   * Runs the configured healthcheck once and shares its outcome.
   *
   * A failure that coincides with the caller's abort signal says nothing
   * about the target itself, so that outcome is not kept: the next request
   * probes again instead of inheriting a cancelled probe forever.
   *
   * @param {AbortSignal} [signal] - Abort signal of the triggering request.
   * @returns {Promise<void>}
   */
  async ensureHealthy(signal) {
    if (!this.config.healthcheck) {
      return;
    }
    if (!this.healthcheckPromise) {
      const pending = this.runHealthcheck(this.config.healthcheck, signal);
      this.healthcheckPromise = pending;
      // Registered before any awaiting caller, so the cache is already
      // cleared by the time the rejection reaches them.
      pending.catch(() => {
        if (signal?.aborted && this.healthcheckPromise === pending) {
          this.healthcheckPromise = void 0;
        }
      });
    }
    return this.healthcheckPromise;
  }
  /**
   * Executes one healthcheck: an HTTP GET for `type: "http"`, otherwise the
   * healthcheck's own command template.
   *
   * @param {object} healthcheck - Healthcheck settings.
   * @param {AbortSignal} [signal] - Abort signal to honor while probing.
   * @returns {Promise<void>}
   * @throws {Error} "CLI healthcheck failed for '<target>': ..." on failure.
   */
  async runHealthcheck(healthcheck, signal) {
    if (!healthcheck) {
      return;
    }
    const timeoutMs = healthcheck.timeoutMs ?? this.config.timeoutMs;
    if (healthcheck.type === "http") {
      const controller = new AbortController();
      const timer = timeoutMs ? setTimeout(() => controller.abort(), timeoutMs) : void 0;
      const forwardAbort = () => controller.abort();
      if (signal?.aborted) {
        // An "abort" listener never fires for an already-aborted signal.
        controller.abort();
      } else {
        signal?.addEventListener("abort", forwardAbort, { once: true });
      }
      try {
        const response = await fetch(healthcheck.url, { method: "GET", signal: controller.signal });
        if (!response.ok) {
          throw new Error(`HTTP ${response.status} ${response.statusText}`);
        }
      } catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        throw new Error(`CLI healthcheck failed for '${this.targetName}': ${reason}`);
      } finally {
        if (timer !== void 0) {
          clearTimeout(timer);
        }
        // Do not leave a listener on the request's signal once the probe is done.
        signal?.removeEventListener("abort", forwardAbort);
      }
      return;
    }
    // Command healthcheck: rendered with empty request values.
    const renderedCommand = renderTemplate(
      healthcheck.commandTemplate,
      buildTemplateValues(
        {
          prompt: "",
          guidelines: "",
          inputFiles: [],
          evalCaseId: "",
          attempt: 0
        },
        this.config
      )
    );
    const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
    const result = await this.runCommand(renderedCommand, {
      cwd: healthcheck.cwd ?? this.config.cwd,
      env,
      timeoutMs,
      signal
    });
    if (result.failed || (result.exitCode ?? 0) !== 0) {
      const codeText = result.exitCode !== null ? result.exitCode : "unknown";
      const detail = result.stderr.trim() || result.stdout.trim();
      const message = detail ? `${detail} (exit code ${codeText})` : `CLI healthcheck command exited with code ${codeText}`;
      throw new Error(`CLI healthcheck failed for '${this.targetName}': ${message}`);
    }
  }
};
|
|
898
|
+
/**
 * Builds the placeholder map used to render a CLI command template.
 *
 * Every value is shell-escaped. FILES is the formatted list of de-duplicated
 * input files.
 *
 * @param {object} request - Provider request (prompt, guidelines, evalCaseId,
 *   attempt, inputFiles).
 * @param {object} config - Provider config; `filesFormat` shapes each file.
 * @returns {{PROMPT: string, GUIDELINES: string, EVAL_ID: string,
 *   ATTEMPT: string, FILES: string}}
 */
function buildTemplateValues(request, config) {
  const files = normalizeInputFiles(request.inputFiles);
  const values = {};
  values.PROMPT = shellEscape(request.prompt ?? "");
  values.GUIDELINES = shellEscape(request.guidelines ?? "");
  values.EVAL_ID = shellEscape(request.evalCaseId ?? "");
  values.ATTEMPT = shellEscape(String(request.attempt ?? 0));
  values.FILES = formatFileList(files, config.filesFormat);
  return values;
}
|
|
908
|
+
/**
 * Resolves input files to absolute paths and drops duplicates, keeping the
 * order of first appearance.
 *
 * @param {string[]|undefined} inputFiles
 * @returns {string[]|undefined} Undefined when there is nothing to normalize.
 */
function normalizeInputFiles(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return void 0;
  }
  const seen = /* @__PURE__ */ new Set();
  for (const file of inputFiles) {
    seen.add(path2.resolve(file));
  }
  return [...seen];
}
|
|
921
|
+
/**
 * Renders the input file list for the `{FILES}` placeholder.
 *
 * Each file is formatted with `template` (default "{path}"). `{path}` becomes
 * the shell-escaped path and `{basename}` the shell-escaped file name. The
 * per-file results are joined with a single space.
 *
 * Substitution is done in one pass with a replacer function, so:
 * - `$`-sequences in a path (`$&`, `$'`, `$1`, ...) are inserted literally
 *   instead of being read as replacement patterns, and
 * - a path that itself contains the text "{basename}" is not substituted a
 *   second time.
 *
 * @param {string[]|undefined} files - Absolute file paths.
 * @param {string} [template] - Per-file format string.
 * @returns {string} The formatted list, or "" when there are no files.
 */
function formatFileList(files, template) {
  if (!files || files.length === 0) {
    return "";
  }
  const formatter = template ?? "{path}";
  return files.map((filePath) => {
    const substitutions = {
      "{path}": shellEscape(filePath),
      "{basename}": shellEscape(path2.basename(filePath))
    };
    return formatter.replace(/\{path\}|\{basename\}/g, (token) => substitutions[token]);
  }).join(" ");
}
|
|
932
|
+
/**
 * Replaces `{UPPER_CASE}` placeholders in `template` with entries of `values`.
 * Placeholders with no defined value are left in the output unchanged.
 *
 * @param {string} template
 * @param {Record<string, string>} values
 * @returns {string}
 */
function renderTemplate(template, values) {
  const substitute = (placeholder, key) => {
    const value = values[key];
    if (value === void 0) {
      return placeholder;
    }
    return value;
  };
  return template.replace(/\{([A-Z_]+)\}/g, substitute);
}
|
|
938
|
+
/**
 * Quotes `value` so the shell that runs the rendered command receives it as
 * one literal argument.
 *
 * @param {string} value - Raw text (prompt, path, id, ...).
 * @returns {string} A single-quoted shell literal; `''` for empty input.
 */
function shellEscape(value) {
  if (value.length === 0) {
    return "''";
  }
  if (process.platform === "win32") {
    // Commands run under powershell.exe on Windows. In PowerShell a
    // double-quoted string still expands `$var` / `$(...)`, and `\"` is not an
    // escape, so double quotes let the value break out and inject commands.
    // A single-quoted string is fully literal; an embedded quote is written
    // by doubling it. PowerShell also accepts the typographic single quotes
    // U+2018-U+201B as quote characters, so those are doubled as well.
    return `'${value.replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&")}'`;
  }
  // POSIX sh: close the quote, emit a double-quoted ', reopen the quote.
  return `'${value.replace(/'/g, `'"'"'`)}'`;
}
|
|
948
|
+
/**
 * Formats the " after Ns" tail of a timeout error message.
 *
 * @param {number|undefined} timeoutMs - Timeout in milliseconds.
 * @returns {string} "" when no positive timeout is set, otherwise the number
 *   of seconds rounded up.
 */
function formatTimeoutSuffix(timeoutMs) {
  if (timeoutMs && !(timeoutMs <= 0)) {
    return ` after ${Math.ceil(timeoutMs / 1e3)}s`;
  }
  return "";
}
|
|
955
|
+
|
|
956
|
+
// src/evaluation/providers/codex.ts
|
|
957
|
+
import { exec as execCallback, spawn } from "node:child_process";
|
|
958
|
+
import { constants as constants2 } from "node:fs";
|
|
959
|
+
import { access as access2, copyFile, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
|
|
960
|
+
import { tmpdir } from "node:os";
|
|
961
|
+
import path4 from "node:path";
|
|
962
|
+
import { promisify as promisify2 } from "node:util";
|
|
963
|
+
|
|
964
|
+
// src/evaluation/providers/preread.ts
|
|
965
|
+
import path3 from "node:path";
|
|
966
|
+
/**
 * Assembles the prompt document sent to an agent: an optional mandatory
 * pre-read block listing guideline and input files, followed by the user
 * query section.
 *
 * @param {object} request - Provider request; `prompt` is the user query.
 * @param {string[]|undefined} inputFiles - Files the agent should read.
 * @param {{guidelinePatterns?: string[], guidelineOverrides?: Set<string>}} [options]
 *   `guidelinePatterns` falls back to `request.guideline_patterns`.
 * @returns {string} The trimmed prompt document.
 */
function buildPromptDocument(request, inputFiles, options) {
  const patterns = options?.guidelinePatterns ?? request.guideline_patterns;
  const guidelineFiles = collectGuidelineFiles(inputFiles, patterns, options?.guidelineOverrides);
  const guidelineLookup = new Set(guidelineFiles);
  // Files already listed as guidelines are not repeated as plain inputs.
  const otherFiles = collectInputFiles(inputFiles).filter((file) => !guidelineLookup.has(file));
  const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, otherFiles);
  const segments = prereadBlock.length > 0 ? ["\n", prereadBlock] : [];
  segments.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
  return segments.join("\n").trim();
}
|
|
984
|
+
/**
 * Resolves input files to absolute paths, removing duplicates while keeping
 * first-seen order.
 *
 * @param {string[]|undefined} inputFiles
 * @returns {string[]|undefined} Undefined for a missing or empty list.
 */
function normalizeInputFiles2(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return void 0;
  }
  const resolvedOnce = /* @__PURE__ */ new Set();
  for (const file of inputFiles) {
    resolvedOnce.add(path3.resolve(file));
  }
  return Array.from(resolvedOnce);
}
|
|
997
|
+
/**
 * Picks the input files that count as guideline files.
 *
 * A file qualifies when its absolute path is in `overrides`, or when its
 * forward-slash-normalized path satisfies isGuidelineFile for the patterns.
 *
 * @param {string[]|undefined} inputFiles
 * @param {string[]|undefined} guidelinePatterns
 * @param {Set<string>} [overrides] - Absolute paths always treated as guidelines.
 * @returns {string[]} Unique absolute paths in first-seen order.
 */
function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  const matched = /* @__PURE__ */ new Set();
  for (const file of inputFiles) {
    const absolute = path3.resolve(file);
    // Overrides short-circuit: the pattern check is skipped for them.
    const isGuideline = overrides?.has(absolute) || isGuidelineFile(absolute.split(path3.sep).join("/"), guidelinePatterns);
    if (isGuideline) {
      matched.add(absolute);
    }
  }
  return [...matched];
}
|
|
1019
|
+
/**
 * Returns the unique absolute paths of the given input files, in first-seen
 * order.
 *
 * @param {string[]|undefined} inputFiles
 * @returns {string[]} Empty array for a missing or empty list.
 */
function collectInputFiles(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  const absolutePaths = /* @__PURE__ */ new Set();
  for (const file of inputFiles) {
    absolutePaths.add(path3.resolve(file));
  }
  return [...absolutePaths];
}
|
|
1032
|
+
/**
 * Builds the instruction block telling the agent which files to read before
 * answering. Files are listed as markdown links to their file URIs.
 *
 * @param {string[]} guidelineFiles - Absolute guideline file paths.
 * @param {string[]} inputFiles - Absolute non-guideline input file paths.
 * @returns {string} The block, or "" when both lists are empty.
 */
function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
  if (!guidelineFiles.length && !inputFiles.length) {
    return "";
  }
  const toMarkdownLinks = (files) => files.map((absolutePath) => `* [${path3.basename(absolutePath)}](${pathToFileUri(absolutePath)})`).join("\n");
  const lines = [];
  if (guidelineFiles.length > 0) {
    lines.push(`Read all guideline files:\n${toMarkdownLinks(guidelineFiles)}.`);
  }
  if (inputFiles.length > 0) {
    lines.push(`Read all input files:\n${toMarkdownLinks(inputFiles)}.`);
  }
  lines.push("If any file is missing, fail with ERROR: missing-file <filename> and stop.");
  lines.push("Then apply system_instructions on the user query below.");
  return lines.join("\n");
}
|
|
1056
|
+
/**
 * Converts a file path to a `file://` URI string.
 *
 * Relative paths are resolved first and backslashes become forward slashes.
 * Paths starting with a drive letter ("C:/...") get the extra leading slash.
 * No percent-encoding is applied.
 *
 * @param {string} filePath
 * @returns {string}
 */
function pathToFileUri(filePath) {
  const absolute = path3.isAbsolute(filePath) ? filePath : path3.resolve(filePath);
  const forwardSlashed = absolute.split("\\").join("/");
  const startsWithDrive = /^[a-zA-Z]:\//.test(forwardSlashed);
  const scheme = startsWithDrive ? "file:///" : "file://";
  return `${scheme}${forwardSlashed}`;
}
|
|
1064
|
+
|
|
1065
|
+
// src/evaluation/providers/codex.ts
|
|
1066
|
+
var execAsync2 = promisify2(execCallback);
|
|
1067
|
+
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
1068
|
+
var PROMPT_FILENAME = "prompt.md";
|
|
1069
|
+
var FILES_DIR = "files";
|
|
1070
|
+
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
1071
|
+
/**
 * Provider that runs the Codex CLI in a throwaway workspace: input files are
 * mirrored into it, the prompt document is written alongside them, and the
 * assistant text is extracted from the CLI's JSON output.
 */
var CodexProvider = class {
  id;
  kind = "codex";
  targetName;
  supportsBatch = false;
  config;
  runCodex;
  // Memoized promise of the one-time executable lookup.
  environmentCheck;
  resolvedExecutable;
  /**
   * @param {string} targetName - Target name; also used to build `id`.
   * @param {object} config - Settings (executable, args, cwd, timeoutMs).
   * @param {Function} [runner] - Process runner; injectable for tests.
   */
  constructor(targetName, config, runner = defaultCodexRunner) {
    this.targetName = targetName;
    this.id = `codex:${targetName}`;
    this.config = config;
    this.runCodex = runner;
  }
  /**
   * Runs one request through the Codex CLI.
   *
   * @param {object} request - Provider request (prompt, inputFiles,
   *   guideline_patterns, signal).
   * @returns {Promise<{text: string, raw: object}>}
   * @throws {Error} On abort, timeout, non-zero exit, or unparsable output.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Codex provider request was aborted before execution");
    }
    await this.ensureEnvironmentReady();
    const inputFiles = normalizeInputFiles2(request.inputFiles);
    const guidelineSources = collectGuidelineFiles(inputFiles, request.guideline_patterns);
    const originalGuidelines = new Set(guidelineSources.map((file) => path4.resolve(file)));
    const workspaceRoot = await this.createWorkspace();
    try {
      const mirror = await this.mirrorInputFiles(inputFiles, workspaceRoot, originalGuidelines);
      const { mirroredInputFiles, guidelineMirrors } = mirror;
      // The prompt refers to the mirrored copies, so guideline status is
      // carried over through `guidelineOverrides`.
      const promptContent = buildPromptDocument(request, mirroredInputFiles, {
        guidelinePatterns: request.guideline_patterns,
        guidelineOverrides: guidelineMirrors
      });
      const promptFile = path4.join(workspaceRoot, PROMPT_FILENAME);
      await writeFile(promptFile, promptContent, "utf8");
      const args = this.buildCodexArgs();
      const cwd = this.resolveCwd(workspaceRoot);
      const result = await this.executeCodex(args, cwd, promptContent, request.signal);
      if (result.timedOut) {
        const suffix = formatTimeoutSuffix2(this.config.timeoutMs ?? void 0);
        throw new Error(`Codex CLI timed out${suffix}`);
      }
      if (result.exitCode !== 0) {
        const detail = pickDetail(result.stderr, result.stdout);
        const prefix = `Codex CLI exited with code ${result.exitCode}`;
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parseCodexJson(result.stdout);
      const raw = {
        response: parsed,
        stdout: result.stdout,
        stderr: result.stderr,
        exitCode: result.exitCode,
        args,
        executable: this.resolvedExecutable ?? this.config.executable,
        promptFile,
        workspace: workspaceRoot,
        inputFiles: mirroredInputFiles
      };
      return { text: extractAssistantText(parsed), raw };
    } finally {
      await this.cleanupWorkspace(workspaceRoot);
    }
  }
  /** Starts the executable lookup on first use and awaits it afterwards. */
  async ensureEnvironmentReady() {
    this.environmentCheck ||= this.validateEnvironment();
    await this.environmentCheck;
  }
  /** Locates the configured executable and remembers its resolved path. */
  async validateEnvironment() {
    this.resolvedExecutable = await locateExecutable(this.config.executable);
  }
  /**
   * @param {string} workspaceRoot - Temporary workspace directory.
   * @returns {string} The configured cwd (made absolute), else the workspace.
   */
  resolveCwd(workspaceRoot) {
    return this.config.cwd ? path4.resolve(this.config.cwd) : workspaceRoot;
  }
  /**
   * @returns {string[]} Fixed CLI flags, then configured extra args, then "-".
   */
  buildCodexArgs() {
    const fixedFlags = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
    const extraArgs = this.config.args && this.config.args.length > 0 ? this.config.args : [];
    return [...fixedFlags, ...extraArgs, "-"];
  }
  /**
   * Invokes the runner, turning a missing executable (ENOENT) into a
   * configuration hint. Other errors are rethrown untouched.
   */
  async executeCodex(args, cwd, promptContent, signal) {
    const invocation = {
      executable: this.resolvedExecutable ?? this.config.executable,
      args,
      cwd,
      prompt: promptContent,
      timeoutMs: this.config.timeoutMs,
      env: process.env,
      signal
    };
    try {
      return await this.runCodex(invocation);
    } catch (error) {
      if (error.code !== "ENOENT") {
        throw error;
      }
      throw new Error(
        `Codex executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
      );
    }
  }
  /**
   * Copies input files into `<workspace>/files`. A repeated base name gets a
   * numeric suffix (`name`, `name.1`, `name.2`, ...).
   *
   * @param {string[]|undefined} inputFiles - Absolute source paths.
   * @param {string} workspaceRoot - Temporary workspace directory.
   * @param {Set<string>} guidelineOriginals - Sources that are guidelines.
   * @returns {Promise<{mirroredInputFiles: string[]|undefined,
   *   guidelineMirrors: Set<string>}>} Copied paths and which are guidelines.
   */
  async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
    const guidelineMirrors = /* @__PURE__ */ new Set();
    if (!inputFiles || inputFiles.length === 0) {
      return { mirroredInputFiles: void 0, guidelineMirrors };
    }
    const filesRoot = path4.join(workspaceRoot, FILES_DIR);
    await mkdir(filesRoot, { recursive: true });
    const mirroredInputFiles = [];
    const usesPerName = /* @__PURE__ */ new Map();
    for (const inputFile of inputFiles) {
      const source = path4.resolve(inputFile);
      const baseName = path4.basename(source);
      const priorUses = usesPerName.get(baseName) ?? 0;
      usesPerName.set(baseName, priorUses + 1);
      const targetName = priorUses === 0 ? baseName : `${baseName}.${priorUses}`;
      const destination = path4.join(filesRoot, targetName);
      await copyFile(source, destination);
      const mirroredPath = path4.resolve(destination);
      mirroredInputFiles.push(mirroredPath);
      if (guidelineOriginals.has(source)) {
        guidelineMirrors.add(mirroredPath);
      }
    }
    return { mirroredInputFiles, guidelineMirrors };
  }
  /** Creates a fresh temporary directory for one invocation. */
  async createWorkspace() {
    return await mkdtemp(path4.join(tmpdir(), WORKSPACE_PREFIX));
  }
  /** Best-effort removal of the workspace; failures are deliberately ignored. */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await rm(workspaceRoot, { recursive: true, force: true });
    } catch {
    }
  }
};
|
|
1226
|
+
/**
 * Resolves the Codex executable to a path that exists on disk.
 *
 * A candidate containing a path separator is treated as a file path. A bare
 * name is looked up with `where` (Windows) or `which` (elsewhere).
 *
 * @param {string} candidate - Configured executable name or path.
 * @returns {Promise<string>} Path of the executable.
 * @throws {Error} When a bare name cannot be found on PATH. For a path
 *   candidate the underlying fs access error propagates.
 */
async function locateExecutable(candidate) {
  const looksLikePath = candidate.includes("/") || candidate.includes("\\");
  if (looksLikePath) {
    const absolute = path4.isAbsolute(candidate) ? candidate : path4.resolve(candidate);
    const onDisk = await ensureWindowsExecutableVariant(absolute);
    await access2(onDisk, constants2.F_OK);
    return onDisk;
  }
  const lookupTool = process.platform === "win32" ? "where" : "which";
  try {
    const { stdout } = await execAsync2(`${lookupTool} ${candidate}`);
    const hits = stdout.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
    const chosen = selectExecutableCandidate(hits);
    if (chosen) {
      const onDisk = await ensureWindowsExecutableVariant(chosen);
      await access2(onDisk, constants2.F_OK);
      return onDisk;
    }
  } catch {
    // Any lookup failure falls through to the "not found" error below.
  }
  throw new Error(`Codex executable '${candidate}' was not found on PATH`);
}
|
|
1248
|
+
/**
 * Chooses which lookup hit to use as the executable.
 *
 * Off Windows the first hit wins. On Windows the hit whose extension appears
 * earliest in the executable-extension list is preferred, falling back to the
 * first hit.
 *
 * @param {string[]} candidates - Paths reported by the lookup tool.
 * @returns {string|undefined} Undefined when there are no candidates.
 */
function selectExecutableCandidate(candidates) {
  if (candidates.length === 0) {
    return void 0;
  }
  if (process.platform === "win32") {
    for (const ext of getWindowsExecutableExtensions()) {
      const hit = candidates.find((candidate) => candidate.toLowerCase().endsWith(ext));
      if (hit) {
        return hit;
      }
    }
  }
  return candidates[0];
}
|
|
1264
|
+
/**
 * On Windows, completes an extension-less executable path with the first
 * executable extension for which a file exists. Everywhere else, and when
 * nothing matches, the candidate is returned unchanged.
 *
 * @param {string} candidate - Executable path.
 * @returns {Promise<string>}
 */
async function ensureWindowsExecutableVariant(candidate) {
  const needsProbe = process.platform === "win32" && !hasExecutableExtension(candidate);
  if (!needsProbe) {
    return candidate;
  }
  for (const ext of getWindowsExecutableExtensions()) {
    const variant = `${candidate}${ext}`;
    try {
      await access2(variant, constants2.F_OK);
      return variant;
    } catch {
      // This variant does not exist; try the next extension.
    }
  }
  return candidate;
}
|
|
1282
|
+
/**
 * Reports whether `candidate` already ends with one of the Windows executable
 * extensions (compared case-insensitively).
 *
 * @param {string} candidate - Executable path.
 * @returns {boolean}
 */
function hasExecutableExtension(candidate) {
  const lowered = candidate.toLowerCase();
  for (const ext of getWindowsExecutableExtensions()) {
    if (lowered.endsWith(ext)) {
      return true;
    }
  }
  return false;
}
|
|
1286
|
+
// Fallback used when PATHEXT is unset or yields no usable entries.
var DEFAULT_WINDOWS_EXTENSIONS = [".com", ".exe", ".bat", ".cmd", ".ps1"];
/**
 * Lists the extensions considered executable on Windows, lower-cased.
 *
 * @returns {string[]} Empty off Windows. Otherwise the PATHEXT entries, or
 *   DEFAULT_WINDOWS_EXTENSIONS when PATHEXT provides none.
 */
function getWindowsExecutableExtensions() {
  if (process.platform !== "win32") {
    return [];
  }
  const configured = process.env.PATHEXT;
  if (configured === void 0 || configured === null) {
    return DEFAULT_WINDOWS_EXTENSIONS;
  }
  const parsed = [];
  for (const piece of configured.split(";")) {
    const ext = piece.trim().toLowerCase();
    if (ext.length > 0) {
      parsed.push(ext);
    }
  }
  return parsed.length > 0 ? parsed : DEFAULT_WINDOWS_EXTENSIONS;
}
|
|
1294
|
+
/**
 * Parses the Codex CLI's `--json` output.
 *
 * Attempts, in order: the whole output as one JSON document; the output as
 * JSON lines (via parseJsonLines); the text from the last "{" to the end.
 *
 * @param {string} output - Raw stdout of the CLI.
 * @returns {unknown} The parsed value (an array when parsed as JSON lines).
 * @throws {Error} When the output is empty or no strategy succeeds. The
 *   message carries at most the first 200 characters of the output.
 */
function parseCodexJson(output) {
  const text = output.trim();
  if (text.length === 0) {
    throw new Error("Codex CLI produced no output in --json mode");
  }
  // Wraps JSON.parse so a legitimately parsed falsy value (null, 0, "") is
  // still distinguishable from a parse failure.
  const tryParse = (source) => {
    try {
      return { ok: true, value: JSON.parse(source) };
    } catch {
      return { ok: false, value: void 0 };
    }
  };
  const whole = tryParse(text);
  if (whole.ok) {
    return whole.value;
  }
  const lineObjects = parseJsonLines(text);
  if (lineObjects) {
    return lineObjects;
  }
  const braceAt = text.lastIndexOf("{");
  if (braceAt >= 0) {
    const tail = tryParse(text.slice(braceAt));
    if (tail.ok) {
      return tail.value;
    }
  }
  const ellipsis = text.length > 200 ? "\u2026" : "";
  throw new Error(`Codex CLI emitted invalid JSON: ${text.slice(0, 200)}${ellipsis}`);
}
|
|
1318
|
+
/**
 * Pulls the assistant's reply out of parsed Codex output.
 *
 * Sources are tried in this order: an event stream (when `parsed` is an
 * array), `parsed` itself as a single event, the last assistant entry in
 * `messages` with non-empty content, `response.content`, then `output`.
 *
 * @param {unknown} parsed - Result of parsing the CLI output.
 * @returns {string} The assistant text.
 * @throws {Error} When no source yields text.
 */
function extractAssistantText(parsed) {
  const missingMessage = "Codex CLI JSON response did not include an assistant message";
  if (Array.isArray(parsed)) {
    const streamed = extractFromEventStream(parsed);
    if (streamed) {
      return streamed;
    }
  }
  if (!parsed || typeof parsed !== "object") {
    throw new Error(missingMessage);
  }
  const direct = extractFromEvent(parsed);
  if (direct) {
    return direct;
  }
  const history = Array.isArray(parsed.messages) ? parsed.messages : [];
  // Newest first, so the latest assistant message with content wins.
  for (const entry of [...history].reverse()) {
    const isAssistant = entry && typeof entry === "object" && entry.role === "assistant";
    if (!isAssistant) {
      continue;
    }
    const text = flattenContent(entry.content);
    if (text) {
      return text;
    }
  }
  const response = parsed.response;
  if (response && typeof response === "object") {
    const text = flattenContent(response.content);
    if (text) {
      return text;
    }
  }
  const fromOutput = flattenContent(parsed.output);
  if (fromOutput) {
    return fromOutput;
  }
  throw new Error(missingMessage);
}
|
|
1366
|
+
/**
 * Scans an array of events from the last element to the first and returns the
 * first non-empty text that extractFromEvent yields.
 *
 * @param {unknown[]} events - Parsed events.
 * @returns {string|undefined} The text found, or undefined when no event has any.
 */
function extractFromEventStream(events) {
  let position = events.length;
  while (position > 0) {
    position -= 1;
    const text = extractFromEvent(events[position]);
    if (text) {
      return text;
    }
  }
  return void 0;
}
|
|
1376
|
+
/**
 * Extracts text from a single event object.
 *
 * When the event's `type` equals JSONL_TYPE_ITEM_COMPLETED, its `item` is
 * examined first; otherwise (or when the item has no text) the event's
 * `output`, falling back to `content`, is flattened.
 *
 * @param {unknown} event - Candidate event.
 * @returns {string|undefined} Non-empty text, or undefined.
 */
function extractFromEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const eventType = typeof event.type === "string" ? event.type : void 0;
  if (eventType === JSONL_TYPE_ITEM_COMPLETED) {
    const itemText = extractFromItem(event.item);
    if (itemText) {
      return itemText;
    }
  }
  const flattened = flattenContent(event.output ?? event.content);
  // An empty string counts as "no text", matching the other extractors.
  return flattened ? flattened : void 0;
}
|
|
1396
|
+
/**
 * Extracts text from the `item` of a completed-item event.
 *
 * Only items whose `type` is "agent_message", "response" or "output" are
 * considered; their `text`, then `content`, then `output` field is flattened.
 *
 * @param {unknown} item - Candidate item.
 * @returns {string|undefined} Non-empty text, or undefined.
 */
function extractFromItem(item) {
  if (!item || typeof item !== "object") {
    return void 0;
  }
  const kind = typeof item.type === "string" ? item.type : void 0;
  switch (kind) {
    case "agent_message":
    case "response":
    case "output": {
      const text = flattenContent(item.text ?? item.content ?? item.output);
      return text ? text : void 0;
    }
    default:
      return void 0;
  }
}
|
|
1410
|
+
/**
 * Reduces a content value to plain text.
 *
 * - A string is returned unchanged (including the empty string).
 * - An array contributes its string elements and the string `text` property
 *   of its object elements; empty pieces are dropped and the rest are joined
 *   with " \n". An array with no usable piece yields undefined.
 * - Any other object yields its `text` property when that is a string.
 * - Everything else yields undefined.
 *
 * @param {unknown} value - Content in any of the shapes above.
 * @returns {string|undefined} Flattened text, or undefined.
 */
function flattenContent(value) {
  // Reads a string `text` property off an object, or returns undefined.
  const textProperty = (node) => {
    if (!node || typeof node !== "object" || !("text" in node)) {
      return void 0;
    }
    const candidate = node.text;
    return typeof candidate === "string" ? candidate : void 0;
  };
  if (typeof value === "string") {
    return value;
  }
  if (!Array.isArray(value)) {
    return textProperty(value);
  }
  const pieces = [];
  for (const segment of value) {
    const piece = typeof segment === "string" ? segment : textProperty(segment);
    if (typeof piece === "string" && piece.length > 0) {
      pieces.push(piece);
    }
  }
  return pieces.length > 0 ? pieces.join(" \n") : void 0;
}
|
|
1433
|
+
/**
 * Parses text as JSON Lines: one JSON document per non-blank line.
 *
 * @param {string} output - Text to parse; lines are split on LF or CRLF.
 * @returns {unknown[]|undefined} The parsed values in order, or undefined when
 *   there are fewer than two non-blank lines or any line is not valid JSON.
 */
function parseJsonLines(output) {
  const rows = [];
  for (const rawLine of output.split(/\r?\n/)) {
    const row = rawLine.trim();
    if (row.length > 0) {
      rows.push(row);
    }
  }
  if (rows.length <= 1) {
    return void 0;
  }
  try {
    return rows.map((row) => JSON.parse(row));
  } catch {
    // A single malformed line disqualifies the whole input.
    return void 0;
  }
}
|
|
1448
|
+
/**
 * Chooses the text to show as failure detail: trimmed stderr when it is
 * non-empty, otherwise trimmed stdout.
 *
 * @param {string} stderr - Captured standard error.
 * @param {string} stdout - Captured standard output.
 * @returns {string|undefined} The chosen text, or undefined when both are blank.
 */
function pickDetail(stderr, stdout) {
  for (const stream of [stderr, stdout]) {
    const text = stream.trim();
    if (text.length > 0) {
      return text;
    }
  }
  return void 0;
}
|
|
1456
|
+
/**
 * Builds the " after Ns" suffix used in timeout messages.
 *
 * @param {number|undefined} timeoutMs - Timeout in milliseconds.
 * @returns {string} " after <seconds>s" with the seconds rounded up, or an
 *   empty string when the timeout is missing, zero or negative.
 */
function formatTimeoutSuffix2(timeoutMs) {
  const disabled = !timeoutMs || timeoutMs <= 0;
  return disabled ? "" : ` after ${Math.ceil(timeoutMs / 1e3)}s`;
}
|
|
1463
|
+
/**
 * Runs the Codex executable as a child process and captures its output.
 *
 * The prompt is written to the child's stdin, which is then closed. stdout and
 * stderr are accumulated as UTF-8 strings.
 *
 * @param {object} options
 * @param {string} options.executable - Program to spawn.
 * @param {string[]} options.args - Arguments passed to the program.
 * @param {string} [options.cwd] - Working directory for the child.
 * @param {object} [options.env] - Environment for the child.
 * @param {string} options.prompt - Text written to the child's stdin.
 * @param {number} [options.timeoutMs] - When positive, the child is sent
 *   SIGTERM after this many milliseconds and `timedOut` is reported as true.
 * @param {AbortSignal} [options.signal] - When aborted, the child is sent SIGTERM.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number, timedOut: boolean}>}
 *   Resolves when the child closes; `exitCode` is -1 when the child reported
 *   no numeric exit code. Rejects when the child emits "error".
 */
async function defaultCodexRunner(options) {
  return new Promise((resolve, reject) => {
    const child = spawn(options.executable, options.args, {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: shouldShellExecute(options.executable)
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    const onAbort = () => {
      child.kill("SIGTERM");
    };
    if (options.signal) {
      if (options.signal.aborted) {
        onAbort();
      } else {
        options.signal.addEventListener("abort", onAbort, { once: true });
      }
    }
    let timeoutHandle;
    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill("SIGTERM");
      }, options.timeoutMs);
      // Do not let a pending timeout keep the event loop alive.
      timeoutHandle.unref?.();
    }
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    // If the child exits (or never starts) before the prompt is consumed, the
    // stdin pipe emits "error" (typically EPIPE). Without a listener that event
    // is thrown as an uncaught exception. The outcome of the run is still
    // reported through the child's own "error"/"close" events below, so the
    // pipe error itself carries no extra information and is ignored here.
    child.stdin.on("error", () => {
    });
    child.stdin.end(options.prompt);
    const cleanup = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", onAbort);
      }
    };
    child.on("error", (error) => {
      cleanup();
      reject(error);
    });
    child.on("close", (code) => {
      cleanup();
      resolve({
        stdout,
        stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
|
|
1524
|
+
/**
 * Decides whether the executable must be launched through a shell.
 *
 * Only true on win32, and only when the executable name ends (ignoring case)
 * with ".cmd", ".bat" or ".ps1".
 *
 * @param {string} executable - Program name or path.
 * @returns {boolean} Value to pass as the `shell` spawn option.
 */
function shouldShellExecute(executable) {
  if (process.platform !== "win32") {
    return false;
  }
  const name = executable.toLowerCase();
  return [".cmd", ".bat", ".ps1"].some((suffix) => name.endsWith(suffix));
}
|
|
1531
|
+
|
|
673
1532
|
// src/evaluation/providers/mock.ts
|
|
674
1533
|
var DEFAULT_MOCK_RESPONSE = '{"answer":"Mock provider response. Configure targets.yaml to supply a custom value."}';
|
|
675
1534
|
var MockProvider = class {
|
|
@@ -713,6 +1572,7 @@ var MockProvider = class {
|
|
|
713
1572
|
|
|
714
1573
|
// src/evaluation/providers/targets.ts
|
|
715
1574
|
import { z } from "zod";
|
|
1575
|
+
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES"]);
|
|
716
1576
|
var BASE_TARGET_SCHEMA = z.object({
|
|
717
1577
|
name: z.string().min(1, "target name is required"),
|
|
718
1578
|
provider: z.string().min(1, "provider is required"),
|
|
@@ -769,6 +1629,16 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
769
1629
|
providerBatching,
|
|
770
1630
|
config: resolveGeminiConfig(parsed, env)
|
|
771
1631
|
};
|
|
1632
|
+
case "codex":
|
|
1633
|
+
case "codex-cli":
|
|
1634
|
+
return {
|
|
1635
|
+
kind: "codex",
|
|
1636
|
+
name: parsed.name,
|
|
1637
|
+
judgeTarget: parsed.judge_target,
|
|
1638
|
+
workers: parsed.workers,
|
|
1639
|
+
providerBatching,
|
|
1640
|
+
config: resolveCodexConfig(parsed, env)
|
|
1641
|
+
};
|
|
772
1642
|
case "mock":
|
|
773
1643
|
return {
|
|
774
1644
|
kind: "mock",
|
|
@@ -788,6 +1658,15 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
788
1658
|
providerBatching,
|
|
789
1659
|
config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
|
|
790
1660
|
};
|
|
1661
|
+
case "cli":
|
|
1662
|
+
return {
|
|
1663
|
+
kind: "cli",
|
|
1664
|
+
name: parsed.name,
|
|
1665
|
+
judgeTarget: parsed.judge_target,
|
|
1666
|
+
workers: parsed.workers,
|
|
1667
|
+
providerBatching,
|
|
1668
|
+
config: resolveCliConfig(parsed, env)
|
|
1669
|
+
};
|
|
791
1670
|
default:
|
|
792
1671
|
throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
|
|
793
1672
|
}
|
|
@@ -855,6 +1734,29 @@ function resolveGeminiConfig(target, env) {
|
|
|
855
1734
|
maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`)
|
|
856
1735
|
};
|
|
857
1736
|
}
|
|
1737
|
+
/**
 * Builds the runtime configuration for a Codex target from its `settings`.
 *
 * Recognised setting keys (first present wins):
 *   - executable: `executable`, `command`, `binary` (defaults to "codex")
 *   - args: `args`, `arguments`
 *   - cwd: `cwd`
 *   - timeout: `timeout_seconds`, `timeoutSeconds`
 *
 * @param {object} target - Parsed target definition (`name`, optional `settings`).
 * @param {object} env - Environment map used to resolve references.
 * @returns {{executable: string, args: (string[]|undefined), cwd: (string|undefined), timeoutMs: (number|undefined)}}
 */
function resolveCodexConfig(target, env) {
  const settings = target.settings ?? {};
  const executable = resolveOptionalString(
    settings.executable ?? settings.command ?? settings.binary,
    env,
    `${target.name} codex executable`,
    { allowLiteral: true, optionalEnv: true }
  ) ?? "codex";
  const args = resolveOptionalStringArray(
    settings.args ?? settings.arguments,
    env,
    `${target.name} codex args`
  );
  const cwd = resolveOptionalString(settings.cwd, env, `${target.name} codex cwd`, {
    allowLiteral: true,
    optionalEnv: true
  });
  const timeoutMs = resolveTimeoutMs(
    settings.timeout_seconds ?? settings.timeoutSeconds,
    `${target.name} codex timeout`
  );
  return { executable, args, cwd, timeoutMs };
}
|
|
858
1760
|
function resolveMockConfig(target) {
|
|
859
1761
|
const settings = target.settings ?? {};
|
|
860
1762
|
const response = typeof settings.response === "string" ? settings.response : void 0;
|
|
@@ -884,6 +1786,125 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
884
1786
|
workspaceTemplate
|
|
885
1787
|
};
|
|
886
1788
|
}
|
|
1789
|
+
/**
 * Builds the runtime configuration for a generic CLI target from its `settings`.
 *
 * Recognised setting keys (first present wins):
 *   - command template: `command_template`, `commandTemplate` (required)
 *   - files format: `files_format`, `filesFormat`, `attachments_format`,
 *     `attachmentsFormat`
 *   - `cwd`, `env`, `healthcheck`
 *   - timeout: `timeout_seconds`, `timeoutSeconds`
 *
 * The command template is checked for unsupported placeholders before the
 * configuration is returned.
 *
 * @param {object} target - Parsed target definition (`name`, optional `settings`).
 * @param {object} env - Environment map used to resolve references.
 * @returns {object} `{ commandTemplate, filesFormat, cwd, env, timeoutMs, healthcheck }`.
 */
function resolveCliConfig(target, env) {
  const settings = target.settings ?? {};
  const templateSource = settings.command_template ?? settings.commandTemplate;
  const formatSource = settings.files_format ?? settings.filesFormat ?? settings.attachments_format ?? settings.attachmentsFormat;
  const timeoutSource = settings.timeout_seconds ?? settings.timeoutSeconds;

  // The resolution order below determines which validation error surfaces first.
  const filesFormat = resolveOptionalLiteralString(formatSource);
  const cwd = resolveOptionalString(settings.cwd, env, `${target.name} working directory`, {
    allowLiteral: true,
    optionalEnv: true
  });
  const envOverrides = resolveEnvOverrides(settings.env, env, target.name);
  const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} timeout`);
  const healthcheck = resolveCliHealthcheck(settings.healthcheck, env, target.name);
  const commandTemplate = resolveString(templateSource, env, `${target.name} CLI command template`, true);
  assertSupportedCliPlaceholders(commandTemplate, `${target.name} CLI command template`);

  return {
    commandTemplate,
    filesFormat,
    cwd,
    env: envOverrides,
    timeoutMs,
    healthcheck
  };
}
|
|
1818
|
+
/**
 * Resolves the `env` setting of a CLI target into a map of string values.
 *
 * @param {unknown} source - The raw `env` setting.
 * @param {object} env - Environment map used to resolve each value.
 * @param {string} targetName - Target name, used in error messages.
 * @returns {Object<string, string>|undefined} Resolved overrides, or undefined
 *   when the setting is absent or produces no entries.
 * @throws {Error} When the setting is not a plain object map or a value is not
 *   a string.
 */
function resolveEnvOverrides(source, env, targetName) {
  if (source === void 0 || source === null) {
    return void 0;
  }
  const isObjectMap = typeof source === "object" && !Array.isArray(source);
  if (!isObjectMap) {
    throw new Error(`${targetName} env overrides must be an object map of strings`);
  }
  const resolved = {};
  Object.entries(source).forEach(([key, value]) => {
    if (typeof value !== "string") {
      throw new Error(`${targetName} env override '${key}' must be a string`);
    }
    resolved[key] = resolveString(value, env, `${targetName} env override '${key}'`);
  });
  return Object.keys(resolved).length > 0 ? resolved : void 0;
}
|
|
1836
|
+
/**
 * Converts a timeout setting expressed in seconds into whole milliseconds.
 *
 * @param {unknown} source - Raw setting, resolved via resolveOptionalNumber.
 * @param {string} description - Label used in error messages.
 * @returns {number|undefined} Milliseconds (rounded down), or undefined when
 *   the setting resolves to undefined.
 * @throws {Error} When the resolved number of seconds is zero or negative.
 */
function resolveTimeoutMs(source, description) {
  const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
  if (seconds !== void 0 && seconds <= 0) {
    throw new Error(`${description} must be greater than zero seconds`);
  }
  return seconds === void 0 ? void 0 : Math.floor(seconds * 1e3);
}
|
|
1846
|
+
/**
 * Resolves the optional `healthcheck` setting of a CLI target.
 *
 * Two shapes are accepted, selected by `type`:
 *   - "http": requires `url`;
 *   - "command": requires `command_template` / `commandTemplate` (validated
 *     for supported placeholders) and accepts an optional `cwd`.
 * Both accept `timeout_seconds` / `timeoutSeconds`.
 *
 * @param {unknown} source - The raw `healthcheck` setting.
 * @param {object} env - Environment map used to resolve references.
 * @param {string} targetName - Target name, used in error messages.
 * @returns {object|undefined} The resolved healthcheck, or undefined when the
 *   setting is absent.
 * @throws {Error} When the setting is not an object or `type` is unsupported.
 */
function resolveCliHealthcheck(source, env, targetName) {
  if (source === void 0 || source === null) {
    return void 0;
  }
  if (typeof source !== "object" || Array.isArray(source)) {
    throw new Error(`${targetName} healthcheck must be an object`);
  }
  const kind = source.type;
  // The timeout is validated before the type so a bad timeout is reported first.
  const timeoutMs = resolveTimeoutMs(
    source.timeout_seconds ?? source.timeoutSeconds,
    `${targetName} healthcheck timeout`
  );
  switch (kind) {
    case "http": {
      const url = resolveString(source.url, env, `${targetName} healthcheck URL`);
      return { type: "http", url, timeoutMs };
    }
    case "command": {
      const commandTemplate = resolveString(
        source.command_template ?? source.commandTemplate,
        env,
        `${targetName} healthcheck command template`,
        true
      );
      assertSupportedCliPlaceholders(commandTemplate, `${targetName} healthcheck command template`);
      const cwd = resolveOptionalString(source.cwd, env, `${targetName} healthcheck cwd`, {
        allowLiteral: true,
        optionalEnv: true
      });
      return { type: "command", commandTemplate, timeoutMs, cwd };
    }
    default:
      throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
  }
}
|
|
1888
|
+
/**
 * Verifies that every `{PLACEHOLDER}` in a command template is a member of
 * CLI_PLACEHOLDERS.
 *
 * @param {string} template - Command template to check.
 * @param {string} description - Label used in the error message.
 * @throws {Error} Naming the first unsupported placeholder and listing the
 *   supported ones.
 */
function assertSupportedCliPlaceholders(template, description) {
  const unsupported = extractCliPlaceholders(template).find(
    (placeholder) => !CLI_PLACEHOLDERS.has(placeholder)
  );
  if (unsupported !== void 0) {
    throw new Error(
      `${description} includes unsupported placeholder '{${unsupported}}'. Supported placeholders: ${Array.from(CLI_PLACEHOLDERS).join(", ")}`
    );
  }
}
|
|
1898
|
+
/**
 * Collects the names of all `{NAME}` placeholders in a template, where NAME
 * consists solely of upper-case ASCII letters and underscores.
 *
 * @param {string} template - Command template to scan.
 * @returns {string[]} Placeholder names in order of appearance; duplicates
 *   are kept.
 */
function extractCliPlaceholders(template) {
  const names = Array.from(template.matchAll(/\{([A-Z_]+)\}/g), (match) => match[1]);
  return names.filter((name) => Boolean(name));
}
|
|
887
1908
|
function resolveString(source, env, description, allowLiteral = false) {
|
|
888
1909
|
const value = resolveOptionalString(source, env, description, {
|
|
889
1910
|
allowLiteral,
|
|
@@ -914,11 +1935,14 @@ function resolveOptionalString(source, env, description, options) {
|
|
|
914
1935
|
}
|
|
915
1936
|
const allowLiteral = options?.allowLiteral ?? false;
|
|
916
1937
|
const optionalEnv = options?.optionalEnv ?? false;
|
|
917
|
-
|
|
1938
|
+
const looksLikeEnv = isLikelyEnvReference(trimmed);
|
|
1939
|
+
if (looksLikeEnv) {
|
|
918
1940
|
if (optionalEnv) {
|
|
919
1941
|
return void 0;
|
|
920
1942
|
}
|
|
921
|
-
|
|
1943
|
+
if (!allowLiteral) {
|
|
1944
|
+
throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
|
|
1945
|
+
}
|
|
922
1946
|
}
|
|
923
1947
|
return trimmed;
|
|
924
1948
|
}
|
|
@@ -968,16 +1992,43 @@ function resolveOptionalBoolean(source) {
|
|
|
968
1992
|
/**
 * Heuristic for "this string looks like an environment variable name": it is
 * non-empty and made up only of upper-case ASCII letters, digits and
 * underscores.
 *
 * @param {string} value - String to test.
 * @returns {boolean} True when the whole string matches that shape.
 */
function isLikelyEnvReference(value) {
  const envNamePattern = /^[A-Z0-9_]+$/;
  return envNamePattern.test(value);
}
|
|
1995
|
+
/**
 * Resolves an optional array-of-strings setting.
 *
 * Each element is trimmed. If the trimmed text is the name of a variable in
 * `env` whose value is a string, that value is substituted; otherwise the
 * trimmed text is used literally.
 *
 * @param {unknown} source - Raw setting.
 * @param {object} env - Environment map consulted for substitutions.
 * @param {string} description - Label used in error messages.
 * @returns {string[]|undefined} Resolved strings, or undefined when the
 *   setting is absent or an empty array.
 * @throws {Error} When the setting is not an array, an element is not a
 *   string or is blank, or a referenced environment variable is blank.
 */
function resolveOptionalStringArray(source, env, description) {
  if (source === void 0 || source === null) {
    return void 0;
  }
  if (!Array.isArray(source)) {
    throw new Error(`${description} must be an array of strings`);
  }
  if (source.length === 0) {
    return void 0;
  }
  const resolved = [];
  for (let i = 0; i < source.length; i++) {
    const item = source[i];
    if (typeof item !== "string") {
      throw new Error(`${description}[${i}] must be a string`);
    }
    const trimmed = item.trim();
    if (trimmed.length === 0) {
      throw new Error(`${description}[${i}] cannot be empty`);
    }
    const envValue = env[trimmed];
    // Only string values count as environment matches. A bare property lookup
    // also finds inherited members such as `toString` or `constructor`, which
    // are not strings and would otherwise crash on `.trim()` below.
    if (typeof envValue === "string") {
      if (envValue.trim().length === 0) {
        throw new Error(`Environment variable '${trimmed}' for ${description}[${i}] is empty`);
      }
      resolved.push(envValue);
    } else {
      resolved.push(trimmed);
    }
  }
  return resolved.length > 0 ? resolved : void 0;
}
|
|
971
2027
|
|
|
972
2028
|
// src/evaluation/providers/vscode.ts
|
|
973
2029
|
import { readFile as readFile2 } from "node:fs/promises";
|
|
974
|
-
import
|
|
975
|
-
import {
|
|
976
|
-
dispatchAgentSession,
|
|
977
|
-
dispatchBatchAgent,
|
|
978
|
-
getSubagentRoot,
|
|
979
|
-
provisionSubagents
|
|
980
|
-
} from "subagent";
|
|
2030
|
+
import path5 from "node:path";
|
|
2031
|
+
import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
|
|
981
2032
|
var VSCodeProvider = class {
|
|
982
2033
|
id;
|
|
983
2034
|
kind;
|
|
@@ -994,12 +2045,11 @@ var VSCodeProvider = class {
|
|
|
994
2045
|
if (request.signal?.aborted) {
|
|
995
2046
|
throw new Error("VS Code provider request was aborted before dispatch");
|
|
996
2047
|
}
|
|
997
|
-
const
|
|
998
|
-
const promptContent =
|
|
2048
|
+
const inputFiles = normalizeAttachments(request.inputFiles);
|
|
2049
|
+
const promptContent = buildPromptDocument2(request, inputFiles, request.guideline_patterns);
|
|
999
2050
|
const session = await dispatchAgentSession({
|
|
1000
2051
|
userQuery: promptContent,
|
|
1001
|
-
|
|
1002
|
-
extraAttachments: attachments,
|
|
2052
|
+
extraAttachments: inputFiles,
|
|
1003
2053
|
wait: this.config.waitForResponse,
|
|
1004
2054
|
dryRun: this.config.dryRun,
|
|
1005
2055
|
vscodeCmd: this.config.command,
|
|
@@ -1016,7 +2066,7 @@ var VSCodeProvider = class {
|
|
|
1016
2066
|
text: "",
|
|
1017
2067
|
raw: {
|
|
1018
2068
|
session,
|
|
1019
|
-
|
|
2069
|
+
inputFiles
|
|
1020
2070
|
}
|
|
1021
2071
|
};
|
|
1022
2072
|
}
|
|
@@ -1025,7 +2075,7 @@ var VSCodeProvider = class {
|
|
|
1025
2075
|
text: responseText,
|
|
1026
2076
|
raw: {
|
|
1027
2077
|
session,
|
|
1028
|
-
|
|
2078
|
+
inputFiles
|
|
1029
2079
|
}
|
|
1030
2080
|
};
|
|
1031
2081
|
}
|
|
@@ -1035,17 +2085,17 @@ var VSCodeProvider = class {
|
|
|
1035
2085
|
}
|
|
1036
2086
|
const normalizedRequests = requests.map((req) => ({
|
|
1037
2087
|
request: req,
|
|
1038
|
-
|
|
2088
|
+
inputFiles: normalizeAttachments(req.inputFiles)
|
|
1039
2089
|
}));
|
|
1040
|
-
const
|
|
1041
|
-
normalizedRequests.map(({
|
|
2090
|
+
const combinedInputFiles = mergeAttachments(
|
|
2091
|
+
normalizedRequests.map(({ inputFiles }) => inputFiles)
|
|
1042
2092
|
);
|
|
1043
2093
|
const userQueries = normalizedRequests.map(
|
|
1044
|
-
({ request,
|
|
2094
|
+
({ request, inputFiles }) => buildPromptDocument2(request, inputFiles, request.guideline_patterns)
|
|
1045
2095
|
);
|
|
1046
2096
|
const session = await dispatchBatchAgent({
|
|
1047
2097
|
userQueries,
|
|
1048
|
-
extraAttachments:
|
|
2098
|
+
extraAttachments: combinedInputFiles,
|
|
1049
2099
|
wait: this.config.waitForResponse,
|
|
1050
2100
|
dryRun: this.config.dryRun,
|
|
1051
2101
|
vscodeCmd: this.config.command,
|
|
@@ -1058,12 +2108,12 @@ var VSCodeProvider = class {
|
|
|
1058
2108
|
throw new Error(failure);
|
|
1059
2109
|
}
|
|
1060
2110
|
if (this.config.dryRun) {
|
|
1061
|
-
return normalizedRequests.map(({
|
|
2111
|
+
return normalizedRequests.map(({ inputFiles }) => ({
|
|
1062
2112
|
text: "",
|
|
1063
2113
|
raw: {
|
|
1064
2114
|
session,
|
|
1065
|
-
|
|
1066
|
-
|
|
2115
|
+
inputFiles,
|
|
2116
|
+
allInputFiles: combinedInputFiles
|
|
1067
2117
|
}
|
|
1068
2118
|
}));
|
|
1069
2119
|
}
|
|
@@ -1079,8 +2129,8 @@ var VSCodeProvider = class {
|
|
|
1079
2129
|
text: responseText,
|
|
1080
2130
|
raw: {
|
|
1081
2131
|
session,
|
|
1082
|
-
|
|
1083
|
-
|
|
2132
|
+
inputFiles: normalizedRequests[index]?.inputFiles,
|
|
2133
|
+
allInputFiles: combinedInputFiles,
|
|
1084
2134
|
responseFile
|
|
1085
2135
|
}
|
|
1086
2136
|
});
|
|
@@ -1088,27 +2138,27 @@ var VSCodeProvider = class {
|
|
|
1088
2138
|
return responses;
|
|
1089
2139
|
}
|
|
1090
2140
|
};
|
|
1091
|
-
function
|
|
2141
|
+
/**
 * Assembles the prompt document sent to the VS Code agent.
 *
 * The document optionally starts with a pre-read block that lists guideline
 * files and the remaining attachments, followed by a
 * "[[ ## user_query ## ]]" section containing the trimmed request prompt.
 *
 * @param {object} request - Provider request; `request.prompt` is the user query.
 * @param {string[]|undefined} attachments - Attachment paths.
 * @param {unknown} guidelinePatterns - Patterns forwarded to the guideline
 *   file collector.
 * @returns {string} The assembled, trimmed document.
 */
function buildPromptDocument2(request, attachments, guidelinePatterns) {
  const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
  const guidelineLookup = new Set(guidelineFiles);
  // Attachments already listed as guidelines are not repeated.
  const nonGuidelineAttachments = collectAttachmentFiles(attachments).filter(
    (file) => !guidelineLookup.has(file)
  );
  const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineAttachments);
  const parts = prereadBlock.length > 0 ? ["\n", prereadBlock] : [];
  parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
  return parts.join("\n").trim();
}
|
|
1105
|
-
function
|
|
2155
|
+
function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
1106
2156
|
if (guidelineFiles.length === 0 && attachmentFiles.length === 0) {
|
|
1107
2157
|
return "";
|
|
1108
2158
|
}
|
|
1109
2159
|
const buildList = (files) => files.map((absolutePath) => {
|
|
1110
|
-
const fileName =
|
|
1111
|
-
const fileUri =
|
|
2160
|
+
const fileName = path5.basename(absolutePath);
|
|
2161
|
+
const fileUri = pathToFileUri2(absolutePath);
|
|
1112
2162
|
return `* [${fileName}](${fileUri})`;
|
|
1113
2163
|
});
|
|
1114
2164
|
const sections = [];
|
|
@@ -1126,14 +2176,14 @@ ${buildList(attachmentFiles).join("\n")}.`);
|
|
|
1126
2176
|
);
|
|
1127
2177
|
return sections.join("\n");
|
|
1128
2178
|
}
|
|
1129
|
-
function
|
|
2179
|
+
function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
1130
2180
|
if (!attachments || attachments.length === 0) {
|
|
1131
2181
|
return [];
|
|
1132
2182
|
}
|
|
1133
2183
|
const unique = /* @__PURE__ */ new Map();
|
|
1134
2184
|
for (const attachment of attachments) {
|
|
1135
|
-
const absolutePath =
|
|
1136
|
-
const normalized = absolutePath.split(
|
|
2185
|
+
const absolutePath = path5.resolve(attachment);
|
|
2186
|
+
const normalized = absolutePath.split(path5.sep).join("/");
|
|
1137
2187
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1138
2188
|
if (!unique.has(absolutePath)) {
|
|
1139
2189
|
unique.set(absolutePath, absolutePath);
|
|
@@ -1148,15 +2198,15 @@ function collectAttachmentFiles(attachments) {
|
|
|
1148
2198
|
}
|
|
1149
2199
|
const unique = /* @__PURE__ */ new Map();
|
|
1150
2200
|
for (const attachment of attachments) {
|
|
1151
|
-
const absolutePath =
|
|
2201
|
+
const absolutePath = path5.resolve(attachment);
|
|
1152
2202
|
if (!unique.has(absolutePath)) {
|
|
1153
2203
|
unique.set(absolutePath, absolutePath);
|
|
1154
2204
|
}
|
|
1155
2205
|
}
|
|
1156
2206
|
return Array.from(unique.values());
|
|
1157
2207
|
}
|
|
1158
|
-
function
|
|
1159
|
-
const absolutePath =
|
|
2208
|
+
function pathToFileUri2(filePath) {
|
|
2209
|
+
const absolutePath = path5.isAbsolute(filePath) ? filePath : path5.resolve(filePath);
|
|
1160
2210
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
1161
2211
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
1162
2212
|
return `file:///${normalizedPath}`;
|
|
@@ -1169,7 +2219,7 @@ function normalizeAttachments(attachments) {
|
|
|
1169
2219
|
}
|
|
1170
2220
|
const deduped = /* @__PURE__ */ new Set();
|
|
1171
2221
|
for (const attachment of attachments) {
|
|
1172
|
-
deduped.add(
|
|
2222
|
+
deduped.add(path5.resolve(attachment));
|
|
1173
2223
|
}
|
|
1174
2224
|
return Array.from(deduped);
|
|
1175
2225
|
}
|
|
@@ -1177,8 +2227,8 @@ function mergeAttachments(all) {
|
|
|
1177
2227
|
const deduped = /* @__PURE__ */ new Set();
|
|
1178
2228
|
for (const list of all) {
|
|
1179
2229
|
if (!list) continue;
|
|
1180
|
-
for (const
|
|
1181
|
-
deduped.add(
|
|
2230
|
+
for (const inputFile of list) {
|
|
2231
|
+
deduped.add(path5.resolve(inputFile));
|
|
1182
2232
|
}
|
|
1183
2233
|
}
|
|
1184
2234
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -1223,9 +2273,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
1223
2273
|
}
|
|
1224
2274
|
|
|
1225
2275
|
// src/evaluation/providers/targets-file.ts
|
|
1226
|
-
import { constants as
|
|
1227
|
-
import { access as
|
|
1228
|
-
import
|
|
2276
|
+
import { constants as constants3 } from "node:fs";
|
|
2277
|
+
import { access as access3, readFile as readFile3 } from "node:fs/promises";
|
|
2278
|
+
import path6 from "node:path";
|
|
1229
2279
|
import { parse as parse2 } from "yaml";
|
|
1230
2280
|
function isRecord(value) {
|
|
1231
2281
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -1281,14 +2331,14 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
1281
2331
|
}
|
|
1282
2332
|
/**
 * Checks whether a path exists by probing it with `access(..., F_OK)`.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} True when the probe succeeds, false when it
 *   fails for any reason.
 */
async function fileExists3(filePath) {
  let exists = true;
  try {
    await access3(filePath, constants3.F_OK);
  } catch {
    exists = false;
  }
  return exists;
}
|
|
1290
2340
|
async function readTargetDefinitions(filePath) {
|
|
1291
|
-
const absolutePath =
|
|
2341
|
+
const absolutePath = path6.resolve(filePath);
|
|
1292
2342
|
if (!await fileExists3(absolutePath)) {
|
|
1293
2343
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
1294
2344
|
}
|
|
@@ -1315,6 +2365,10 @@ function createProvider(target) {
|
|
|
1315
2365
|
return new AnthropicProvider(target.name, target.config);
|
|
1316
2366
|
case "gemini":
|
|
1317
2367
|
return new GeminiProvider(target.name, target.config);
|
|
2368
|
+
case "cli":
|
|
2369
|
+
return new CliProvider(target.name, target.config);
|
|
2370
|
+
case "codex":
|
|
2371
|
+
return new CodexProvider(target.name, target.config);
|
|
1318
2372
|
case "mock":
|
|
1319
2373
|
return new MockProvider(target.name, target.config);
|
|
1320
2374
|
case "vscode":
|
|
@@ -1331,230 +2385,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
1331
2385
|
return createProvider(resolved);
|
|
1332
2386
|
}
|
|
1333
2387
|
|
|
1334
|
-
// src/evaluation/
|
|
1335
|
-
var KEY_TERM_MATCH_THRESHOLD = 0.5;
|
|
1336
|
-
var ACTION_WORDS = /* @__PURE__ */ new Set([
|
|
1337
|
-
"use",
|
|
1338
|
-
"avoid",
|
|
1339
|
-
"prefer",
|
|
1340
|
-
"replace",
|
|
1341
|
-
"consider",
|
|
1342
|
-
"ensure",
|
|
1343
|
-
"remove",
|
|
1344
|
-
"add"
|
|
1345
|
-
]);
|
|
1346
|
-
var STOP_WORDS = /* @__PURE__ */ new Set([
|
|
1347
|
-
"the",
|
|
1348
|
-
"a",
|
|
1349
|
-
"an",
|
|
1350
|
-
"and",
|
|
1351
|
-
"or",
|
|
1352
|
-
"but",
|
|
1353
|
-
"in",
|
|
1354
|
-
"on",
|
|
1355
|
-
"at",
|
|
1356
|
-
"to",
|
|
1357
|
-
"for",
|
|
1358
|
-
"of",
|
|
1359
|
-
"with",
|
|
1360
|
-
"by",
|
|
1361
|
-
"is",
|
|
1362
|
-
"are",
|
|
1363
|
-
"was",
|
|
1364
|
-
"were",
|
|
1365
|
-
"be",
|
|
1366
|
-
"been",
|
|
1367
|
-
"being",
|
|
1368
|
-
"have",
|
|
1369
|
-
"has",
|
|
1370
|
-
"had",
|
|
1371
|
-
"do",
|
|
1372
|
-
"does",
|
|
1373
|
-
"did",
|
|
1374
|
-
"will",
|
|
1375
|
-
"would",
|
|
1376
|
-
"could",
|
|
1377
|
-
"should"
|
|
1378
|
-
]);
|
|
1379
|
-
var ERROR_PREFIXES = [
|
|
1380
|
-
"error:",
|
|
1381
|
-
"err:",
|
|
1382
|
-
"vs code command failed",
|
|
1383
|
-
"exception",
|
|
1384
|
-
"traceback",
|
|
1385
|
-
"no response file was generated",
|
|
1386
|
-
"timed out",
|
|
1387
|
-
"cli not found"
|
|
1388
|
-
];
|
|
1389
|
-
function extractAspects(expectedResponse) {
|
|
1390
|
-
const lines = expectedResponse.split(/\r?\n/).map((line) => line.trim());
|
|
1391
|
-
const aspects = [];
|
|
1392
|
-
for (const line of lines) {
|
|
1393
|
-
if (line.length === 0) {
|
|
1394
|
-
continue;
|
|
1395
|
-
}
|
|
1396
|
-
const bulletMatch = /^([-*•]|[0-9]+\.)\s*(.+)$/.exec(line);
|
|
1397
|
-
if (bulletMatch) {
|
|
1398
|
-
const normalized = normalizeAspect(bulletMatch[2]);
|
|
1399
|
-
if (normalized.length > 0) {
|
|
1400
|
-
aspects.push(normalized);
|
|
1401
|
-
}
|
|
1402
|
-
continue;
|
|
1403
|
-
}
|
|
1404
|
-
const lowered = line.toLowerCase();
|
|
1405
|
-
if (Array.from(ACTION_WORDS).some((word) => lowered.startsWith(word))) {
|
|
1406
|
-
const normalized = normalizeAspect(line);
|
|
1407
|
-
if (normalized.length > 0) {
|
|
1408
|
-
aspects.push(normalized);
|
|
1409
|
-
}
|
|
1410
|
-
}
|
|
1411
|
-
}
|
|
1412
|
-
return aspects;
|
|
1413
|
-
}
|
|
1414
|
-
function calculateHits(candidateResponse, expectedAspects) {
|
|
1415
|
-
const { normalizedText, words } = normalizeCandidate(candidateResponse);
|
|
1416
|
-
const hits = [];
|
|
1417
|
-
for (const aspect of expectedAspects) {
|
|
1418
|
-
if (matchesAspect(aspect, normalizedText, words)) {
|
|
1419
|
-
hits.push(aspect);
|
|
1420
|
-
}
|
|
1421
|
-
}
|
|
1422
|
-
return hits;
|
|
1423
|
-
}
|
|
1424
|
-
function calculateMisses(candidateResponse, expectedAspects, resolvedHits) {
|
|
1425
|
-
const hits = new Set(resolvedHits ?? calculateHits(candidateResponse, expectedAspects));
|
|
1426
|
-
return expectedAspects.filter((aspect) => !hits.has(aspect));
|
|
1427
|
-
}
|
|
1428
|
-
function scoreCandidateResponse(candidateResponse, expectedAspects) {
|
|
1429
|
-
if (expectedAspects.length === 0) {
|
|
1430
|
-
if (isErrorLike(candidateResponse)) {
|
|
1431
|
-
return {
|
|
1432
|
-
score: 0,
|
|
1433
|
-
hits: [],
|
|
1434
|
-
misses: ["Model produced an error instead of an answer."],
|
|
1435
|
-
hitCount: 0,
|
|
1436
|
-
totalAspects: 0,
|
|
1437
|
-
rawAspects: []
|
|
1438
|
-
};
|
|
1439
|
-
}
|
|
1440
|
-
return {
|
|
1441
|
-
score: 1,
|
|
1442
|
-
hits: [],
|
|
1443
|
-
misses: [],
|
|
1444
|
-
hitCount: 0,
|
|
1445
|
-
totalAspects: 0,
|
|
1446
|
-
rawAspects: []
|
|
1447
|
-
};
|
|
1448
|
-
}
|
|
1449
|
-
const hits = calculateHits(candidateResponse, expectedAspects);
|
|
1450
|
-
const misses = expectedAspects.filter((aspect) => !hits.includes(aspect));
|
|
1451
|
-
const score = expectedAspects.length > 0 ? hits.length / expectedAspects.length : 0;
|
|
1452
|
-
return {
|
|
1453
|
-
score,
|
|
1454
|
-
hits,
|
|
1455
|
-
misses,
|
|
1456
|
-
hitCount: hits.length,
|
|
1457
|
-
totalAspects: expectedAspects.length,
|
|
1458
|
-
rawAspects: expectedAspects
|
|
1459
|
-
};
|
|
1460
|
-
}
|
|
1461
|
-
function isErrorLike(text) {
|
|
1462
|
-
if (!text) {
|
|
1463
|
-
return false;
|
|
1464
|
-
}
|
|
1465
|
-
const lowered = text.trim().toLowerCase();
|
|
1466
|
-
return ERROR_PREFIXES.some((prefix) => lowered.startsWith(prefix));
|
|
1467
|
-
}
|
|
1468
|
-
function normalizeAspect(aspect) {
|
|
1469
|
-
const sanitized = aspect.toLowerCase().replace(/[^\w\s]/g, " ").replace(/\s+/g, " ").trim();
|
|
1470
|
-
return sanitized;
|
|
1471
|
-
}
|
|
1472
|
-
function normalizeCandidate(candidate) {
|
|
1473
|
-
const lowered = candidate.toLowerCase();
|
|
1474
|
-
const normalizedText = lowered.replace(/[^\w\s]/g, " ");
|
|
1475
|
-
const words = new Set(normalizedText.split(/\s+/).filter((word) => word.length > 0));
|
|
1476
|
-
return { normalizedText, words };
|
|
1477
|
-
}
|
|
1478
|
-
function matchesAspect(aspect, candidateNormalized, candidateWords) {
|
|
1479
|
-
const keyTerms = extractKeyTerms(aspect);
|
|
1480
|
-
if (keyTerms.length === 0) {
|
|
1481
|
-
return false;
|
|
1482
|
-
}
|
|
1483
|
-
const matches = keyTerms.filter((term) => candidateWords.has(term)).length;
|
|
1484
|
-
const ratio = matches / keyTerms.length;
|
|
1485
|
-
if (ratio >= KEY_TERM_MATCH_THRESHOLD) {
|
|
1486
|
-
return true;
|
|
1487
|
-
}
|
|
1488
|
-
const aspectWords = aspect.split(" ");
|
|
1489
|
-
if (aspectWords.length >= 2) {
|
|
1490
|
-
for (let index = 0; index < aspectWords.length - 1; index += 1) {
|
|
1491
|
-
const phrase = `${aspectWords[index]} ${aspectWords[index + 1]}`;
|
|
1492
|
-
if (candidateNormalized.includes(phrase)) {
|
|
1493
|
-
return true;
|
|
1494
|
-
}
|
|
1495
|
-
}
|
|
1496
|
-
}
|
|
1497
|
-
return false;
|
|
1498
|
-
}
|
|
1499
|
-
function extractKeyTerms(aspect, maxTerms = 5) {
|
|
1500
|
-
const terms = [];
|
|
1501
|
-
const words = aspect.split(" ");
|
|
1502
|
-
for (const word of words) {
|
|
1503
|
-
if (word.length <= 2) {
|
|
1504
|
-
continue;
|
|
1505
|
-
}
|
|
1506
|
-
if (STOP_WORDS.has(word)) {
|
|
1507
|
-
continue;
|
|
1508
|
-
}
|
|
1509
|
-
terms.push(word);
|
|
1510
|
-
if (terms.length >= maxTerms) {
|
|
1511
|
-
break;
|
|
1512
|
-
}
|
|
1513
|
-
}
|
|
1514
|
-
return terms;
|
|
1515
|
-
}
|
|
1516
|
-
|
|
1517
|
-
// src/evaluation/grading.ts
|
|
2388
|
+
// src/evaluation/evaluators.ts
|
|
1518
2389
|
import { randomUUID } from "node:crypto";
|
|
1519
|
-
var
|
|
1520
|
-
kind = "heuristic";
|
|
1521
|
-
grade(context) {
|
|
1522
|
-
const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
|
|
1523
|
-
const result = scoreCandidateResponse(context.candidate, expectedAspects);
|
|
1524
|
-
const misses = [...result.misses];
|
|
1525
|
-
if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
|
|
1526
|
-
const firstLine = context.candidate.split(/\r?\n/)[0]?.trim();
|
|
1527
|
-
if (firstLine && !misses.includes(firstLine)) {
|
|
1528
|
-
misses.unshift(firstLine);
|
|
1529
|
-
}
|
|
1530
|
-
}
|
|
1531
|
-
return {
|
|
1532
|
-
score: result.score,
|
|
1533
|
-
hits: result.hits,
|
|
1534
|
-
misses,
|
|
1535
|
-
expectedAspectCount: result.totalAspects,
|
|
1536
|
-
rawAspects: result.rawAspects
|
|
1537
|
-
};
|
|
1538
|
-
}
|
|
1539
|
-
};
|
|
1540
|
-
var QualityGrader = class {
|
|
2390
|
+
var LlmJudgeEvaluator = class {
|
|
1541
2391
|
kind = "llm_judge";
|
|
1542
2392
|
resolveJudgeProvider;
|
|
1543
2393
|
maxOutputTokens;
|
|
1544
2394
|
temperature;
|
|
2395
|
+
customPrompt;
|
|
1545
2396
|
constructor(options) {
|
|
1546
2397
|
this.resolveJudgeProvider = options.resolveJudgeProvider;
|
|
1547
2398
|
this.maxOutputTokens = options.maxOutputTokens;
|
|
1548
2399
|
this.temperature = options.temperature;
|
|
2400
|
+
this.customPrompt = options.customPrompt;
|
|
1549
2401
|
}
|
|
1550
|
-
async
|
|
2402
|
+
async evaluate(context) {
|
|
1551
2403
|
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
1552
2404
|
if (!judgeProvider) {
|
|
1553
2405
|
throw new Error("No judge provider available for LLM grading");
|
|
1554
2406
|
}
|
|
1555
2407
|
const prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
2408
|
+
const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
|
|
1556
2409
|
const metadata = {
|
|
1557
|
-
systemPrompt:
|
|
2410
|
+
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2411
|
+
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
1558
2412
|
};
|
|
1559
2413
|
const response = await judgeProvider.invoke({
|
|
1560
2414
|
prompt,
|
|
@@ -1569,12 +2423,13 @@ var QualityGrader = class {
|
|
|
1569
2423
|
const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
1570
2424
|
const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
1571
2425
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
1572
|
-
const
|
|
2426
|
+
const evaluatorRawRequest = {
|
|
1573
2427
|
id: randomUUID(),
|
|
1574
2428
|
provider: judgeProvider.id,
|
|
1575
2429
|
prompt,
|
|
1576
|
-
|
|
1577
|
-
|
|
2430
|
+
target: context.target.name,
|
|
2431
|
+
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2432
|
+
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
1578
2433
|
};
|
|
1579
2434
|
return {
|
|
1580
2435
|
score,
|
|
@@ -1582,7 +2437,7 @@ var QualityGrader = class {
|
|
|
1582
2437
|
misses,
|
|
1583
2438
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
1584
2439
|
reasoning,
|
|
1585
|
-
|
|
2440
|
+
evaluatorRawRequest
|
|
1586
2441
|
};
|
|
1587
2442
|
}
|
|
1588
2443
|
};
|
|
@@ -1700,11 +2555,117 @@ function extractJsonBlob(text) {
|
|
|
1700
2555
|
function isNonEmptyString(value) {
|
|
1701
2556
|
return typeof value === "string" && value.trim().length > 0;
|
|
1702
2557
|
}
|
|
2558
|
+
var CodeEvaluator = class {
|
|
2559
|
+
kind = "code";
|
|
2560
|
+
script;
|
|
2561
|
+
cwd;
|
|
2562
|
+
agentTimeoutMs;
|
|
2563
|
+
constructor(options) {
|
|
2564
|
+
this.script = options.script;
|
|
2565
|
+
this.cwd = options.cwd;
|
|
2566
|
+
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
2567
|
+
}
|
|
2568
|
+
async evaluate(context) {
|
|
2569
|
+
const inputPayload = JSON.stringify(
|
|
2570
|
+
{
|
|
2571
|
+
task: context.evalCase.task,
|
|
2572
|
+
outcome: context.evalCase.outcome,
|
|
2573
|
+
expected: context.evalCase.expected_assistant_raw,
|
|
2574
|
+
output: context.candidate,
|
|
2575
|
+
system_message: context.promptInputs.systemMessage ?? "",
|
|
2576
|
+
guideline_paths: context.evalCase.guideline_paths,
|
|
2577
|
+
attachments: context.evalCase.file_paths,
|
|
2578
|
+
user_segments: context.evalCase.user_segments
|
|
2579
|
+
},
|
|
2580
|
+
null,
|
|
2581
|
+
2
|
|
2582
|
+
);
|
|
2583
|
+
try {
|
|
2584
|
+
const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
|
|
2585
|
+
const parsed = parseJsonSafe(stdout);
|
|
2586
|
+
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
2587
|
+
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
2588
|
+
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
2589
|
+
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
2590
|
+
return {
|
|
2591
|
+
score,
|
|
2592
|
+
hits,
|
|
2593
|
+
misses,
|
|
2594
|
+
expectedAspectCount: hits.length + misses.length || 1,
|
|
2595
|
+
reasoning,
|
|
2596
|
+
evaluatorRawRequest: {
|
|
2597
|
+
script: this.script,
|
|
2598
|
+
...this.cwd ? { cwd: this.cwd } : {}
|
|
2599
|
+
}
|
|
2600
|
+
};
|
|
2601
|
+
} catch (error) {
|
|
2602
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
2603
|
+
return {
|
|
2604
|
+
score: 0,
|
|
2605
|
+
hits: [],
|
|
2606
|
+
misses: [`Code evaluator failed: ${message}`],
|
|
2607
|
+
expectedAspectCount: 1,
|
|
2608
|
+
reasoning: message,
|
|
2609
|
+
evaluatorRawRequest: {
|
|
2610
|
+
script: this.script,
|
|
2611
|
+
...this.cwd ? { cwd: this.cwd } : {},
|
|
2612
|
+
error: message
|
|
2613
|
+
}
|
|
2614
|
+
};
|
|
2615
|
+
}
|
|
2616
|
+
}
|
|
2617
|
+
};
|
|
2618
|
+
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
|
|
2619
|
+
const { spawn: spawn2 } = await import("node:child_process");
|
|
2620
|
+
return await new Promise((resolve, reject) => {
|
|
2621
|
+
const child = spawn2(scriptPath, {
|
|
2622
|
+
shell: true,
|
|
2623
|
+
cwd
|
|
2624
|
+
});
|
|
2625
|
+
let stdout = "";
|
|
2626
|
+
let stderr = "";
|
|
2627
|
+
const timeout = agentTimeoutMs ? setTimeout(() => {
|
|
2628
|
+
child.kill();
|
|
2629
|
+
reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
|
|
2630
|
+
}, agentTimeoutMs) : void 0;
|
|
2631
|
+
child.stdout?.on("data", (data) => {
|
|
2632
|
+
stdout += data.toString();
|
|
2633
|
+
});
|
|
2634
|
+
child.stderr?.on("data", (data) => {
|
|
2635
|
+
stderr += data.toString();
|
|
2636
|
+
});
|
|
2637
|
+
child.on("error", (error) => {
|
|
2638
|
+
if (timeout !== void 0) {
|
|
2639
|
+
clearTimeout(timeout);
|
|
2640
|
+
}
|
|
2641
|
+
reject(error);
|
|
2642
|
+
});
|
|
2643
|
+
child.on("exit", (code) => {
|
|
2644
|
+
if (timeout !== void 0) {
|
|
2645
|
+
clearTimeout(timeout);
|
|
2646
|
+
}
|
|
2647
|
+
if (code && code !== 0 && stderr.length > 0) {
|
|
2648
|
+
reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
|
|
2649
|
+
return;
|
|
2650
|
+
}
|
|
2651
|
+
resolve(stdout.trim());
|
|
2652
|
+
});
|
|
2653
|
+
child.stdin?.write(input);
|
|
2654
|
+
child.stdin?.end();
|
|
2655
|
+
});
|
|
2656
|
+
}
|
|
2657
|
+
function parseJsonSafe(payload) {
|
|
2658
|
+
try {
|
|
2659
|
+
return JSON.parse(payload);
|
|
2660
|
+
} catch {
|
|
2661
|
+
return void 0;
|
|
2662
|
+
}
|
|
2663
|
+
}
|
|
1703
2664
|
|
|
1704
2665
|
// src/evaluation/orchestrator.ts
|
|
1705
2666
|
import { createHash, randomUUID as randomUUID2 } from "node:crypto";
|
|
1706
|
-
import { mkdir, writeFile as writeFile2 } from "node:fs/promises";
|
|
1707
|
-
import
|
|
2667
|
+
import { mkdir as mkdir2, readFile as readFile4, writeFile as writeFile2 } from "node:fs/promises";
|
|
2668
|
+
import path7 from "node:path";
|
|
1708
2669
|
|
|
1709
2670
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
1710
2671
|
var Node = class {
|
|
@@ -1851,7 +2812,7 @@ async function runEvaluation(options) {
|
|
|
1851
2812
|
targets,
|
|
1852
2813
|
env,
|
|
1853
2814
|
providerFactory,
|
|
1854
|
-
|
|
2815
|
+
evaluators,
|
|
1855
2816
|
maxRetries,
|
|
1856
2817
|
agentTimeoutMs,
|
|
1857
2818
|
promptDumpDir,
|
|
@@ -1910,7 +2871,7 @@ async function runEvaluation(options) {
|
|
|
1910
2871
|
}
|
|
1911
2872
|
return getOrCreateProvider(resolvedJudge);
|
|
1912
2873
|
};
|
|
1913
|
-
const
|
|
2874
|
+
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
1914
2875
|
const primaryProvider = getOrCreateProvider(target);
|
|
1915
2876
|
const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
|
|
1916
2877
|
if (target.providerBatching && !providerSupportsBatch && verbose) {
|
|
@@ -1933,13 +2894,14 @@ async function runEvaluation(options) {
|
|
|
1933
2894
|
evalCases: filteredEvalCases,
|
|
1934
2895
|
provider: primaryProvider,
|
|
1935
2896
|
target,
|
|
1936
|
-
|
|
2897
|
+
evaluatorRegistry,
|
|
1937
2898
|
promptDumpDir,
|
|
1938
2899
|
nowFn: now ?? (() => /* @__PURE__ */ new Date()),
|
|
1939
2900
|
onProgress,
|
|
1940
2901
|
onResult,
|
|
1941
2902
|
verbose,
|
|
1942
|
-
resolveJudgeProvider
|
|
2903
|
+
resolveJudgeProvider,
|
|
2904
|
+
agentTimeoutMs
|
|
1943
2905
|
});
|
|
1944
2906
|
} catch (error) {
|
|
1945
2907
|
if (verbose) {
|
|
@@ -1970,7 +2932,7 @@ async function runEvaluation(options) {
|
|
|
1970
2932
|
evalCase,
|
|
1971
2933
|
provider: primaryProvider,
|
|
1972
2934
|
target,
|
|
1973
|
-
|
|
2935
|
+
evaluators: evaluatorRegistry,
|
|
1974
2936
|
maxRetries,
|
|
1975
2937
|
agentTimeoutMs,
|
|
1976
2938
|
promptDumpDir,
|
|
@@ -2036,12 +2998,13 @@ async function runBatchEvaluation(options) {
|
|
|
2036
2998
|
evalCases,
|
|
2037
2999
|
provider,
|
|
2038
3000
|
target,
|
|
2039
|
-
|
|
3001
|
+
evaluatorRegistry,
|
|
2040
3002
|
promptDumpDir,
|
|
2041
3003
|
nowFn,
|
|
2042
3004
|
onProgress,
|
|
2043
3005
|
onResult,
|
|
2044
|
-
resolveJudgeProvider
|
|
3006
|
+
resolveJudgeProvider,
|
|
3007
|
+
agentTimeoutMs
|
|
2045
3008
|
} = options;
|
|
2046
3009
|
const promptInputsList = [];
|
|
2047
3010
|
for (const evalCase of evalCases) {
|
|
@@ -2057,7 +3020,7 @@ async function runBatchEvaluation(options) {
|
|
|
2057
3020
|
prompt: promptInputs.request,
|
|
2058
3021
|
guidelines: promptInputs.guidelines,
|
|
2059
3022
|
guideline_patterns: evalCase.guideline_patterns,
|
|
2060
|
-
|
|
3023
|
+
inputFiles: evalCase.file_paths,
|
|
2061
3024
|
evalCaseId: evalCase.id,
|
|
2062
3025
|
metadata: {
|
|
2063
3026
|
systemPrompt: promptInputs.systemMessage ?? ""
|
|
@@ -2089,23 +3052,19 @@ async function runBatchEvaluation(options) {
|
|
|
2089
3052
|
const evalCase = evalCases[i];
|
|
2090
3053
|
const promptInputs = promptInputsList[i];
|
|
2091
3054
|
const providerResponse = batchResponse[i];
|
|
2092
|
-
|
|
2093
|
-
const graderKind = evalCase.grader ?? "heuristic";
|
|
2094
|
-
const activeGrader = graderRegistry[graderKind] ?? graderRegistry.heuristic;
|
|
2095
|
-
if (!activeGrader) {
|
|
2096
|
-
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
2097
|
-
}
|
|
2098
|
-
let grade;
|
|
3055
|
+
let result;
|
|
2099
3056
|
try {
|
|
2100
|
-
|
|
3057
|
+
result = await evaluateCandidate({
|
|
2101
3058
|
evalCase,
|
|
2102
3059
|
candidate: providerResponse.text ?? "",
|
|
2103
3060
|
target,
|
|
2104
3061
|
provider,
|
|
2105
|
-
|
|
3062
|
+
evaluators: evaluatorRegistry,
|
|
2106
3063
|
promptInputs,
|
|
2107
|
-
|
|
2108
|
-
|
|
3064
|
+
nowFn,
|
|
3065
|
+
attempt: 0,
|
|
3066
|
+
judgeProvider: await resolveJudgeProvider(target),
|
|
3067
|
+
agentTimeoutMs
|
|
2109
3068
|
});
|
|
2110
3069
|
} catch (error) {
|
|
2111
3070
|
const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
@@ -2124,28 +3083,6 @@ async function runBatchEvaluation(options) {
|
|
|
2124
3083
|
}
|
|
2125
3084
|
continue;
|
|
2126
3085
|
}
|
|
2127
|
-
const completedAt = nowFn();
|
|
2128
|
-
const rawRequest = {
|
|
2129
|
-
request: promptInputs.request,
|
|
2130
|
-
guidelines: promptInputs.guidelines,
|
|
2131
|
-
guideline_paths: evalCase.guideline_paths,
|
|
2132
|
-
system_message: promptInputs.systemMessage ?? ""
|
|
2133
|
-
};
|
|
2134
|
-
const result = {
|
|
2135
|
-
eval_id: evalCase.id,
|
|
2136
|
-
conversation_id: evalCase.conversation_id,
|
|
2137
|
-
score: grade.score,
|
|
2138
|
-
hits: grade.hits,
|
|
2139
|
-
misses: grade.misses,
|
|
2140
|
-
model_answer: providerResponse.text ?? "",
|
|
2141
|
-
expected_aspect_count: grade.expectedAspectCount,
|
|
2142
|
-
target: target.name,
|
|
2143
|
-
timestamp: completedAt.toISOString(),
|
|
2144
|
-
reasoning: grade.reasoning,
|
|
2145
|
-
raw_aspects: grade.rawAspects,
|
|
2146
|
-
raw_request: rawRequest,
|
|
2147
|
-
grader_raw_request: grade.graderRawRequest
|
|
2148
|
-
};
|
|
2149
3086
|
results.push(result);
|
|
2150
3087
|
if (onResult) {
|
|
2151
3088
|
await onResult(result);
|
|
@@ -2167,7 +3104,7 @@ async function runEvalCase(options) {
|
|
|
2167
3104
|
evalCase,
|
|
2168
3105
|
provider,
|
|
2169
3106
|
target,
|
|
2170
|
-
|
|
3107
|
+
evaluators,
|
|
2171
3108
|
now,
|
|
2172
3109
|
maxRetries,
|
|
2173
3110
|
agentTimeoutMs,
|
|
@@ -2222,27 +3159,49 @@ async function runEvalCase(options) {
|
|
|
2222
3159
|
if (cacheKey && cache && !cachedResponse) {
|
|
2223
3160
|
await cache.set(cacheKey, providerResponse);
|
|
2224
3161
|
}
|
|
2225
|
-
const graderKind = evalCase.grader ?? "heuristic";
|
|
2226
|
-
const activeGrader = graders[graderKind] ?? graders.heuristic;
|
|
2227
|
-
if (!activeGrader) {
|
|
2228
|
-
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
2229
|
-
}
|
|
2230
|
-
let grade;
|
|
2231
3162
|
try {
|
|
2232
|
-
|
|
2233
|
-
grade = await activeGrader.grade({
|
|
3163
|
+
return await evaluateCandidate({
|
|
2234
3164
|
evalCase,
|
|
2235
3165
|
candidate: providerResponse.text ?? "",
|
|
2236
3166
|
target,
|
|
2237
3167
|
provider,
|
|
2238
|
-
|
|
3168
|
+
evaluators,
|
|
2239
3169
|
promptInputs,
|
|
2240
|
-
|
|
2241
|
-
|
|
3170
|
+
nowFn,
|
|
3171
|
+
attempt,
|
|
3172
|
+
judgeProvider,
|
|
3173
|
+
agentTimeoutMs
|
|
2242
3174
|
});
|
|
2243
3175
|
} catch (error) {
|
|
2244
3176
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
2245
3177
|
}
|
|
3178
|
+
}
|
|
3179
|
+
async function evaluateCandidate(options) {
|
|
3180
|
+
const {
|
|
3181
|
+
evalCase,
|
|
3182
|
+
candidate,
|
|
3183
|
+
target,
|
|
3184
|
+
provider,
|
|
3185
|
+
evaluators,
|
|
3186
|
+
promptInputs,
|
|
3187
|
+
nowFn,
|
|
3188
|
+
attempt,
|
|
3189
|
+
judgeProvider,
|
|
3190
|
+
agentTimeoutMs
|
|
3191
|
+
} = options;
|
|
3192
|
+
const gradeTimestamp = nowFn();
|
|
3193
|
+
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
3194
|
+
evalCase,
|
|
3195
|
+
candidate,
|
|
3196
|
+
target,
|
|
3197
|
+
provider,
|
|
3198
|
+
evaluators,
|
|
3199
|
+
attempt,
|
|
3200
|
+
promptInputs,
|
|
3201
|
+
now: gradeTimestamp,
|
|
3202
|
+
judgeProvider,
|
|
3203
|
+
agentTimeoutMs
|
|
3204
|
+
});
|
|
2246
3205
|
const completedAt = nowFn();
|
|
2247
3206
|
const rawRequest = {
|
|
2248
3207
|
request: promptInputs.request,
|
|
@@ -2253,28 +3212,200 @@ async function runEvalCase(options) {
|
|
|
2253
3212
|
return {
|
|
2254
3213
|
eval_id: evalCase.id,
|
|
2255
3214
|
conversation_id: evalCase.conversation_id,
|
|
2256
|
-
score:
|
|
2257
|
-
hits:
|
|
2258
|
-
misses:
|
|
2259
|
-
model_answer:
|
|
2260
|
-
expected_aspect_count:
|
|
3215
|
+
score: score.score,
|
|
3216
|
+
hits: score.hits,
|
|
3217
|
+
misses: score.misses,
|
|
3218
|
+
model_answer: candidate,
|
|
3219
|
+
expected_aspect_count: score.expectedAspectCount,
|
|
2261
3220
|
target: target.name,
|
|
2262
3221
|
timestamp: completedAt.toISOString(),
|
|
2263
|
-
reasoning:
|
|
2264
|
-
raw_aspects:
|
|
3222
|
+
reasoning: score.reasoning,
|
|
3223
|
+
raw_aspects: score.rawAspects,
|
|
2265
3224
|
raw_request: rawRequest,
|
|
2266
|
-
|
|
3225
|
+
evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
3226
|
+
evaluator_results: evaluatorResults
|
|
2267
3227
|
};
|
|
2268
3228
|
}
|
|
3229
|
+
async function runEvaluatorsForCase(options) {
|
|
3230
|
+
const { evalCase, candidate, target, provider, evaluators, attempt, promptInputs, now, judgeProvider, agentTimeoutMs } = options;
|
|
3231
|
+
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
3232
|
+
return runEvaluatorList({
|
|
3233
|
+
evalCase,
|
|
3234
|
+
evaluators: evalCase.evaluators,
|
|
3235
|
+
candidate,
|
|
3236
|
+
target,
|
|
3237
|
+
provider,
|
|
3238
|
+
evaluatorRegistry: evaluators,
|
|
3239
|
+
attempt,
|
|
3240
|
+
promptInputs,
|
|
3241
|
+
now,
|
|
3242
|
+
judgeProvider,
|
|
3243
|
+
agentTimeoutMs
|
|
3244
|
+
});
|
|
3245
|
+
}
|
|
3246
|
+
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
3247
|
+
const activeEvaluator = evaluators[evaluatorKind] ?? evaluators.llm_judge;
|
|
3248
|
+
if (!activeEvaluator) {
|
|
3249
|
+
throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
|
|
3250
|
+
}
|
|
3251
|
+
const score = await activeEvaluator.evaluate({
|
|
3252
|
+
evalCase,
|
|
3253
|
+
candidate,
|
|
3254
|
+
target,
|
|
3255
|
+
provider,
|
|
3256
|
+
attempt,
|
|
3257
|
+
promptInputs,
|
|
3258
|
+
now,
|
|
3259
|
+
judgeProvider
|
|
3260
|
+
});
|
|
3261
|
+
return { score };
|
|
3262
|
+
}
|
|
3263
|
+
async function runEvaluatorList(options) {
|
|
3264
|
+
const {
|
|
3265
|
+
evalCase,
|
|
3266
|
+
evaluators,
|
|
3267
|
+
candidate,
|
|
3268
|
+
target,
|
|
3269
|
+
provider,
|
|
3270
|
+
evaluatorRegistry,
|
|
3271
|
+
attempt,
|
|
3272
|
+
promptInputs,
|
|
3273
|
+
now,
|
|
3274
|
+
judgeProvider,
|
|
3275
|
+
agentTimeoutMs
|
|
3276
|
+
} = options;
|
|
3277
|
+
const scored = [];
|
|
3278
|
+
const evaluatorResults = [];
|
|
3279
|
+
for (const evaluator of evaluators ?? []) {
|
|
3280
|
+
try {
|
|
3281
|
+
if (evaluator.type === "llm_judge") {
|
|
3282
|
+
const score2 = await runLlmJudgeEvaluator({
|
|
3283
|
+
config: evaluator,
|
|
3284
|
+
evalCase,
|
|
3285
|
+
candidate,
|
|
3286
|
+
target,
|
|
3287
|
+
provider,
|
|
3288
|
+
evaluatorRegistry,
|
|
3289
|
+
attempt,
|
|
3290
|
+
promptInputs,
|
|
3291
|
+
now,
|
|
3292
|
+
judgeProvider
|
|
3293
|
+
});
|
|
3294
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
3295
|
+
evaluatorResults.push({
|
|
3296
|
+
name: evaluator.name,
|
|
3297
|
+
type: evaluator.type,
|
|
3298
|
+
score: score2.score,
|
|
3299
|
+
hits: score2.hits,
|
|
3300
|
+
misses: score2.misses,
|
|
3301
|
+
reasoning: score2.reasoning,
|
|
3302
|
+
evaluator_raw_request: score2.evaluatorRawRequest
|
|
3303
|
+
});
|
|
3304
|
+
continue;
|
|
3305
|
+
}
|
|
3306
|
+
if (evaluator.type === "code") {
|
|
3307
|
+
const codeEvaluator = new CodeEvaluator({
|
|
3308
|
+
script: evaluator.script,
|
|
3309
|
+
cwd: evaluator.resolvedCwd ?? evaluator.cwd,
|
|
3310
|
+
agentTimeoutMs
|
|
3311
|
+
});
|
|
3312
|
+
const score2 = await codeEvaluator.evaluate({
|
|
3313
|
+
evalCase,
|
|
3314
|
+
candidate,
|
|
3315
|
+
target,
|
|
3316
|
+
provider,
|
|
3317
|
+
attempt,
|
|
3318
|
+
promptInputs,
|
|
3319
|
+
now
|
|
3320
|
+
});
|
|
3321
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
3322
|
+
evaluatorResults.push({
|
|
3323
|
+
name: evaluator.name,
|
|
3324
|
+
type: evaluator.type,
|
|
3325
|
+
score: score2.score,
|
|
3326
|
+
hits: score2.hits,
|
|
3327
|
+
misses: score2.misses,
|
|
3328
|
+
reasoning: score2.reasoning,
|
|
3329
|
+
evaluator_raw_request: score2.evaluatorRawRequest
|
|
3330
|
+
});
|
|
3331
|
+
continue;
|
|
3332
|
+
}
|
|
3333
|
+
} catch (error) {
|
|
3334
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3335
|
+
const fallbackScore = {
|
|
3336
|
+
score: 0,
|
|
3337
|
+
hits: [],
|
|
3338
|
+
misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
|
|
3339
|
+
expectedAspectCount: 1,
|
|
3340
|
+
reasoning: message
|
|
3341
|
+
};
|
|
3342
|
+
scored.push({ score: fallbackScore, name: evaluator.name ?? "unknown", type: evaluator.type ?? "unknown" });
|
|
3343
|
+
evaluatorResults.push({
|
|
3344
|
+
name: evaluator.name ?? "unknown",
|
|
3345
|
+
type: evaluator.type ?? "unknown",
|
|
3346
|
+
score: 0,
|
|
3347
|
+
hits: [],
|
|
3348
|
+
misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
|
|
3349
|
+
reasoning: message
|
|
3350
|
+
});
|
|
3351
|
+
}
|
|
3352
|
+
}
|
|
3353
|
+
const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
|
|
3354
|
+
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
3355
|
+
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
3356
|
+
const expectedAspectCount = scored.reduce((total, entry) => total + (entry.score.expectedAspectCount ?? 0), 0);
|
|
3357
|
+
const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
|
|
3358
|
+
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
|
|
3359
|
+
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
3360
|
+
const score = {
|
|
3361
|
+
score: aggregateScore,
|
|
3362
|
+
hits,
|
|
3363
|
+
misses,
|
|
3364
|
+
expectedAspectCount,
|
|
3365
|
+
reasoning,
|
|
3366
|
+
rawAspects: rawAspects.length > 0 ? rawAspects : void 0
|
|
3367
|
+
};
|
|
3368
|
+
return { score, evaluatorResults };
|
|
3369
|
+
}
|
|
3370
|
+
async function runLlmJudgeEvaluator(options) {
|
|
3371
|
+
const { config, evalCase, candidate, target, provider, evaluatorRegistry, attempt, promptInputs, now, judgeProvider } = options;
|
|
3372
|
+
const customPrompt = await resolveCustomPrompt(config);
|
|
3373
|
+
return evaluatorRegistry.llm_judge.evaluate({
|
|
3374
|
+
evalCase,
|
|
3375
|
+
candidate,
|
|
3376
|
+
target,
|
|
3377
|
+
provider,
|
|
3378
|
+
attempt,
|
|
3379
|
+
promptInputs,
|
|
3380
|
+
now,
|
|
3381
|
+
judgeProvider,
|
|
3382
|
+
systemPrompt: customPrompt,
|
|
3383
|
+
evaluator: config,
|
|
3384
|
+
judgeModel: config.model
|
|
3385
|
+
});
|
|
3386
|
+
}
|
|
3387
|
+
async function resolveCustomPrompt(config) {
|
|
3388
|
+
if (config.promptPath) {
|
|
3389
|
+
try {
|
|
3390
|
+
return await readFile4(config.promptPath, "utf8");
|
|
3391
|
+
} catch (error) {
|
|
3392
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3393
|
+
console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
|
|
3394
|
+
}
|
|
3395
|
+
}
|
|
3396
|
+
return config.prompt;
|
|
3397
|
+
}
|
|
3398
|
+
function isNonEmptyString2(value) {
|
|
3399
|
+
return typeof value === "string" && value.trim().length > 0;
|
|
3400
|
+
}
|
|
2269
3401
|
function filterEvalCases(evalCases, evalId) {
|
|
2270
3402
|
if (!evalId) {
|
|
2271
3403
|
return evalCases;
|
|
2272
3404
|
}
|
|
2273
3405
|
return evalCases.filter((evalCase) => evalCase.id === evalId);
|
|
2274
3406
|
}
|
|
2275
|
-
function
|
|
2276
|
-
const
|
|
2277
|
-
const llmJudge = overrides?.llm_judge ?? new QualityGrader({
|
|
3407
|
+
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
3408
|
+
const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
|
|
2278
3409
|
resolveJudgeProvider: async (context) => {
|
|
2279
3410
|
if (context.judgeProvider) {
|
|
2280
3411
|
return context.judgeProvider;
|
|
@@ -2284,15 +3415,14 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
|
|
|
2284
3415
|
});
|
|
2285
3416
|
return {
|
|
2286
3417
|
...overrides,
|
|
2287
|
-
heuristic,
|
|
2288
3418
|
llm_judge: llmJudge
|
|
2289
3419
|
};
|
|
2290
3420
|
}
|
|
2291
3421
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
2292
3422
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
2293
3423
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
2294
|
-
const filePath =
|
|
2295
|
-
await
|
|
3424
|
+
const filePath = path7.resolve(directory, filename);
|
|
3425
|
+
await mkdir2(path7.dirname(filePath), { recursive: true });
|
|
2296
3426
|
const payload = {
|
|
2297
3427
|
eval_id: evalCase.id,
|
|
2298
3428
|
request: promptInputs.request,
|
|
@@ -2309,7 +3439,7 @@ function sanitizeFilename(value) {
|
|
|
2309
3439
|
return sanitized.length > 0 ? sanitized : randomUUID2();
|
|
2310
3440
|
}
|
|
2311
3441
|
async function invokeProvider(provider, options) {
|
|
2312
|
-
const { evalCase,
|
|
3442
|
+
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
2313
3443
|
const controller = new AbortController();
|
|
2314
3444
|
const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
|
|
2315
3445
|
if (signal) {
|
|
@@ -2320,7 +3450,7 @@ async function invokeProvider(provider, options) {
|
|
|
2320
3450
|
prompt: promptInputs.request,
|
|
2321
3451
|
guidelines: promptInputs.guidelines,
|
|
2322
3452
|
guideline_patterns: evalCase.guideline_patterns,
|
|
2323
|
-
|
|
3453
|
+
inputFiles: evalCase.file_paths,
|
|
2324
3454
|
evalCaseId: evalCase.id,
|
|
2325
3455
|
attempt,
|
|
2326
3456
|
metadata: {
|
|
@@ -2388,25 +3518,20 @@ function createAgentKernel() {
|
|
|
2388
3518
|
return { status: "stub" };
|
|
2389
3519
|
}
|
|
2390
3520
|
export {
|
|
2391
|
-
|
|
2392
|
-
|
|
2393
|
-
QualityGrader,
|
|
3521
|
+
CodeEvaluator,
|
|
3522
|
+
LlmJudgeEvaluator,
|
|
2394
3523
|
TEST_MESSAGE_ROLES,
|
|
2395
3524
|
buildDirectoryChain,
|
|
2396
3525
|
buildPromptInputs,
|
|
2397
3526
|
buildSearchRoots,
|
|
2398
|
-
calculateHits,
|
|
2399
|
-
calculateMisses,
|
|
2400
3527
|
createAgentKernel,
|
|
2401
3528
|
createProvider,
|
|
2402
3529
|
ensureVSCodeSubagents,
|
|
2403
|
-
extractAspects,
|
|
2404
3530
|
extractCodeBlocks,
|
|
2405
3531
|
fileExists,
|
|
2406
3532
|
findGitRoot,
|
|
2407
3533
|
getHitCount,
|
|
2408
|
-
|
|
2409
|
-
isGraderKind,
|
|
3534
|
+
isEvaluatorKind,
|
|
2410
3535
|
isGuidelineFile,
|
|
2411
3536
|
isJsonObject,
|
|
2412
3537
|
isJsonValue,
|
|
@@ -2419,7 +3544,6 @@ export {
|
|
|
2419
3544
|
resolveFileReference,
|
|
2420
3545
|
resolveTargetDefinition,
|
|
2421
3546
|
runEvalCase,
|
|
2422
|
-
runEvaluation
|
|
2423
|
-
scoreCandidateResponse
|
|
3547
|
+
runEvaluation
|
|
2424
3548
|
};
|
|
2425
3549
|
//# sourceMappingURL=index.js.map
|