@agentv/core 0.2.11 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-P4GOYWYH.js → chunk-NL7K4CAK.js} +5 -1
- package/dist/chunk-NL7K4CAK.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +186 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +183 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1519 -396
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +107 -63
- package/dist/index.d.ts +107 -63
- package/dist/index.js +1519 -395
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-P4GOYWYH.js.map +0 -1
- package/dist/chunk-XXNQA4EW.js +0 -140
- package/dist/chunk-XXNQA4EW.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -5,7 +5,7 @@ import {
|
|
|
5
5
|
fileExists,
|
|
6
6
|
findGitRoot,
|
|
7
7
|
resolveFileReference
|
|
8
|
-
} from "./chunk-
|
|
8
|
+
} from "./chunk-NL7K4CAK.js";
|
|
9
9
|
|
|
10
10
|
// src/evaluation/types.ts
|
|
11
11
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -48,11 +48,10 @@ function isTestMessage(value) {
|
|
|
48
48
|
}
|
|
49
49
|
return candidate.content.every(isJsonObject);
|
|
50
50
|
}
|
|
51
|
-
var
|
|
52
|
-
var
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
return typeof value === "string" && GRADER_KIND_SET.has(value);
|
|
51
|
+
var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
|
|
52
|
+
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
53
|
+
function isEvaluatorKind(value) {
|
|
54
|
+
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
56
55
|
}
|
|
57
56
|
function getHitCount(result) {
|
|
58
57
|
return result.hits.length;
|
|
@@ -160,7 +159,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
160
159
|
if (!Array.isArray(rawTestcases)) {
|
|
161
160
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
162
161
|
}
|
|
163
|
-
const
|
|
162
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
164
163
|
const results = [];
|
|
165
164
|
for (const rawEvalcase of rawTestcases) {
|
|
166
165
|
if (!isJsonObject(rawEvalcase)) {
|
|
@@ -283,7 +282,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
283
282
|
const assistantContent = assistantMessages[0]?.content;
|
|
284
283
|
const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
285
284
|
const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
286
|
-
const
|
|
285
|
+
const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
286
|
+
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
287
287
|
const userFilePaths = [];
|
|
288
288
|
for (const segment of userSegments) {
|
|
289
289
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -306,7 +306,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
306
306
|
file_paths: allFilePaths,
|
|
307
307
|
code_snippets: codeSnippets,
|
|
308
308
|
outcome,
|
|
309
|
-
|
|
309
|
+
evaluator: testCaseEvaluatorKind,
|
|
310
|
+
evaluators
|
|
310
311
|
};
|
|
311
312
|
if (verbose) {
|
|
312
313
|
console.log(`
|
|
@@ -467,14 +468,88 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
|
467
468
|
}
|
|
468
469
|
return parts.join(" ");
|
|
469
470
|
}
|
|
470
|
-
function
|
|
471
|
+
async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
|
|
472
|
+
const execution = rawEvalCase.execution;
|
|
473
|
+
const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators;
|
|
474
|
+
if (candidateEvaluators === void 0) {
|
|
475
|
+
return void 0;
|
|
476
|
+
}
|
|
477
|
+
if (!Array.isArray(candidateEvaluators)) {
|
|
478
|
+
logWarning(`Skipping evaluators for '${evalId}': expected array`);
|
|
479
|
+
return void 0;
|
|
480
|
+
}
|
|
481
|
+
const evaluators = [];
|
|
482
|
+
for (const rawEvaluator of candidateEvaluators) {
|
|
483
|
+
if (!isJsonObject(rawEvaluator)) {
|
|
484
|
+
logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
|
|
485
|
+
continue;
|
|
486
|
+
}
|
|
487
|
+
const name = asString(rawEvaluator.name);
|
|
488
|
+
const typeValue = rawEvaluator.type;
|
|
489
|
+
if (!name || !isEvaluatorKind(typeValue)) {
|
|
490
|
+
logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
491
|
+
continue;
|
|
492
|
+
}
|
|
493
|
+
if (typeValue === "code") {
|
|
494
|
+
const script = asString(rawEvaluator.script);
|
|
495
|
+
if (!script) {
|
|
496
|
+
logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
|
|
497
|
+
continue;
|
|
498
|
+
}
|
|
499
|
+
const cwd = asString(rawEvaluator.cwd);
|
|
500
|
+
let resolvedCwd;
|
|
501
|
+
if (cwd) {
|
|
502
|
+
const resolved = await resolveFileReference(cwd, searchRoots);
|
|
503
|
+
if (resolved.resolvedPath) {
|
|
504
|
+
resolvedCwd = path.resolve(resolved.resolvedPath);
|
|
505
|
+
} else {
|
|
506
|
+
logWarning(
|
|
507
|
+
`Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
508
|
+
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
509
|
+
);
|
|
510
|
+
}
|
|
511
|
+
}
|
|
512
|
+
evaluators.push({
|
|
513
|
+
name,
|
|
514
|
+
type: "code",
|
|
515
|
+
script,
|
|
516
|
+
cwd,
|
|
517
|
+
resolvedCwd
|
|
518
|
+
});
|
|
519
|
+
continue;
|
|
520
|
+
}
|
|
521
|
+
const prompt = asString(rawEvaluator.prompt);
|
|
522
|
+
let promptPath;
|
|
523
|
+
if (prompt) {
|
|
524
|
+
const resolved = await resolveFileReference(prompt, searchRoots);
|
|
525
|
+
if (resolved.resolvedPath) {
|
|
526
|
+
promptPath = path.resolve(resolved.resolvedPath);
|
|
527
|
+
} else {
|
|
528
|
+
logWarning(
|
|
529
|
+
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
530
|
+
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
531
|
+
);
|
|
532
|
+
}
|
|
533
|
+
}
|
|
534
|
+
const model = asString(rawEvaluator.model);
|
|
535
|
+
evaluators.push({
|
|
536
|
+
name,
|
|
537
|
+
type: "llm_judge",
|
|
538
|
+
prompt,
|
|
539
|
+
promptPath,
|
|
540
|
+
model
|
|
541
|
+
});
|
|
542
|
+
}
|
|
543
|
+
return evaluators.length > 0 ? evaluators : void 0;
|
|
544
|
+
}
|
|
545
|
+
function coerceEvaluator(candidate, contextId) {
|
|
471
546
|
if (typeof candidate !== "string") {
|
|
472
547
|
return void 0;
|
|
473
548
|
}
|
|
474
|
-
if (
|
|
549
|
+
if (isEvaluatorKind(candidate)) {
|
|
475
550
|
return candidate;
|
|
476
551
|
}
|
|
477
|
-
logWarning(`Unknown
|
|
552
|
+
logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
478
553
|
return void 0;
|
|
479
554
|
}
|
|
480
555
|
function logWarning(message, details) {
|
|
@@ -670,6 +745,214 @@ var GeminiProvider = class {
|
|
|
670
745
|
}
|
|
671
746
|
};
|
|
672
747
|
|
|
748
|
+
// src/evaluation/providers/cli.ts
|
|
749
|
+
import { exec as execWithCallback } from "node:child_process";
|
|
750
|
+
import path2 from "node:path";
|
|
751
|
+
import { promisify } from "node:util";
|
|
752
|
+
var execAsync = promisify(execWithCallback);
|
|
753
|
+
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
|
|
754
|
+
async function defaultCommandRunner(command, options) {
|
|
755
|
+
const execOptions = {
|
|
756
|
+
cwd: options.cwd,
|
|
757
|
+
env: options.env,
|
|
758
|
+
timeout: options.timeoutMs,
|
|
759
|
+
signal: options.signal,
|
|
760
|
+
maxBuffer: DEFAULT_MAX_BUFFER,
|
|
761
|
+
shell: process.platform === "win32" ? "powershell.exe" : void 0
|
|
762
|
+
};
|
|
763
|
+
try {
|
|
764
|
+
const { stdout, stderr } = await execAsync(command, execOptions);
|
|
765
|
+
return {
|
|
766
|
+
stdout,
|
|
767
|
+
stderr,
|
|
768
|
+
exitCode: 0,
|
|
769
|
+
failed: false,
|
|
770
|
+
timedOut: false,
|
|
771
|
+
signal: null
|
|
772
|
+
};
|
|
773
|
+
} catch (error) {
|
|
774
|
+
const execError = error;
|
|
775
|
+
return {
|
|
776
|
+
stdout: execError.stdout ?? "",
|
|
777
|
+
stderr: execError.stderr ?? "",
|
|
778
|
+
exitCode: typeof execError.code === "number" ? execError.code : null,
|
|
779
|
+
failed: true,
|
|
780
|
+
timedOut: execError.timedOut === true || execError.killed === true,
|
|
781
|
+
signal: execError.signal ?? null
|
|
782
|
+
};
|
|
783
|
+
}
|
|
784
|
+
}
|
|
785
|
+
var CliProvider = class {
|
|
786
|
+
id;
|
|
787
|
+
kind = "cli";
|
|
788
|
+
targetName;
|
|
789
|
+
supportsBatch = false;
|
|
790
|
+
config;
|
|
791
|
+
runCommand;
|
|
792
|
+
healthcheckPromise;
|
|
793
|
+
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
794
|
+
this.targetName = targetName;
|
|
795
|
+
this.id = `cli:${targetName}`;
|
|
796
|
+
this.config = config;
|
|
797
|
+
this.runCommand = runner;
|
|
798
|
+
}
|
|
799
|
+
async invoke(request) {
|
|
800
|
+
if (request.signal?.aborted) {
|
|
801
|
+
throw new Error("CLI provider request was aborted before execution");
|
|
802
|
+
}
|
|
803
|
+
await this.ensureHealthy(request.signal);
|
|
804
|
+
const templateValues = buildTemplateValues(request, this.config);
|
|
805
|
+
const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
|
|
806
|
+
const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
|
|
807
|
+
const result = await this.runCommand(renderedCommand, {
|
|
808
|
+
cwd: this.config.cwd,
|
|
809
|
+
env,
|
|
810
|
+
timeoutMs: this.config.timeoutMs,
|
|
811
|
+
signal: request.signal
|
|
812
|
+
});
|
|
813
|
+
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
814
|
+
if (request.signal?.aborted) {
|
|
815
|
+
throw new Error("CLI provider request was aborted");
|
|
816
|
+
}
|
|
817
|
+
if (result.timedOut) {
|
|
818
|
+
throw new Error(
|
|
819
|
+
`CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
|
|
820
|
+
);
|
|
821
|
+
}
|
|
822
|
+
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
823
|
+
const detail = result.stderr.trim() || result.stdout.trim();
|
|
824
|
+
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
825
|
+
throw new Error(message);
|
|
826
|
+
}
|
|
827
|
+
return {
|
|
828
|
+
text: result.stdout,
|
|
829
|
+
raw: {
|
|
830
|
+
command: renderedCommand,
|
|
831
|
+
stderr: result.stderr,
|
|
832
|
+
exitCode: result.exitCode ?? 0,
|
|
833
|
+
cwd: this.config.cwd
|
|
834
|
+
}
|
|
835
|
+
};
|
|
836
|
+
}
|
|
837
|
+
async ensureHealthy(signal) {
|
|
838
|
+
if (!this.config.healthcheck) {
|
|
839
|
+
return;
|
|
840
|
+
}
|
|
841
|
+
if (!this.healthcheckPromise) {
|
|
842
|
+
this.healthcheckPromise = this.runHealthcheck(this.config.healthcheck, signal);
|
|
843
|
+
}
|
|
844
|
+
return this.healthcheckPromise;
|
|
845
|
+
}
|
|
846
|
+
async runHealthcheck(healthcheck, signal) {
|
|
847
|
+
if (!healthcheck) {
|
|
848
|
+
return;
|
|
849
|
+
}
|
|
850
|
+
const timeoutMs = healthcheck.timeoutMs ?? this.config.timeoutMs;
|
|
851
|
+
if (healthcheck.type === "http") {
|
|
852
|
+
const controller = new AbortController();
|
|
853
|
+
const timer = timeoutMs ? setTimeout(() => controller.abort(), timeoutMs) : void 0;
|
|
854
|
+
signal?.addEventListener("abort", () => controller.abort(), { once: true });
|
|
855
|
+
try {
|
|
856
|
+
const response = await fetch(healthcheck.url, { method: "GET", signal: controller.signal });
|
|
857
|
+
if (!response.ok) {
|
|
858
|
+
throw new Error(`HTTP ${response.status} ${response.statusText}`);
|
|
859
|
+
}
|
|
860
|
+
} catch (error) {
|
|
861
|
+
const reason = error instanceof Error ? error.message : String(error);
|
|
862
|
+
throw new Error(`CLI healthcheck failed for '${this.targetName}': ${reason}`);
|
|
863
|
+
} finally {
|
|
864
|
+
if (timer !== void 0) {
|
|
865
|
+
clearTimeout(timer);
|
|
866
|
+
}
|
|
867
|
+
}
|
|
868
|
+
return;
|
|
869
|
+
}
|
|
870
|
+
const renderedCommand = renderTemplate(
|
|
871
|
+
healthcheck.commandTemplate,
|
|
872
|
+
buildTemplateValues(
|
|
873
|
+
{
|
|
874
|
+
prompt: "",
|
|
875
|
+
guidelines: "",
|
|
876
|
+
inputFiles: [],
|
|
877
|
+
evalCaseId: "",
|
|
878
|
+
attempt: 0
|
|
879
|
+
},
|
|
880
|
+
this.config
|
|
881
|
+
)
|
|
882
|
+
);
|
|
883
|
+
const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
|
|
884
|
+
const result = await this.runCommand(renderedCommand, {
|
|
885
|
+
cwd: healthcheck.cwd ?? this.config.cwd,
|
|
886
|
+
env,
|
|
887
|
+
timeoutMs,
|
|
888
|
+
signal
|
|
889
|
+
});
|
|
890
|
+
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
891
|
+
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
892
|
+
const detail = result.stderr.trim() || result.stdout.trim();
|
|
893
|
+
const message = detail ? `${detail} (exit code ${codeText})` : `CLI healthcheck command exited with code ${codeText}`;
|
|
894
|
+
throw new Error(`CLI healthcheck failed for '${this.targetName}': ${message}`);
|
|
895
|
+
}
|
|
896
|
+
}
|
|
897
|
+
};
|
|
898
|
+
function buildTemplateValues(request, config) {
|
|
899
|
+
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
900
|
+
return {
|
|
901
|
+
PROMPT: shellEscape(request.prompt ?? ""),
|
|
902
|
+
GUIDELINES: shellEscape(request.guidelines ?? ""),
|
|
903
|
+
EVAL_ID: shellEscape(request.evalCaseId ?? ""),
|
|
904
|
+
ATTEMPT: shellEscape(String(request.attempt ?? 0)),
|
|
905
|
+
FILES: formatFileList(inputFiles, config.filesFormat)
|
|
906
|
+
};
|
|
907
|
+
}
|
|
908
|
+
function normalizeInputFiles(inputFiles) {
|
|
909
|
+
if (!inputFiles || inputFiles.length === 0) {
|
|
910
|
+
return void 0;
|
|
911
|
+
}
|
|
912
|
+
const unique = /* @__PURE__ */ new Map();
|
|
913
|
+
for (const inputFile of inputFiles) {
|
|
914
|
+
const absolutePath = path2.resolve(inputFile);
|
|
915
|
+
if (!unique.has(absolutePath)) {
|
|
916
|
+
unique.set(absolutePath, absolutePath);
|
|
917
|
+
}
|
|
918
|
+
}
|
|
919
|
+
return Array.from(unique.values());
|
|
920
|
+
}
|
|
921
|
+
function formatFileList(files, template) {
|
|
922
|
+
if (!files || files.length === 0) {
|
|
923
|
+
return "";
|
|
924
|
+
}
|
|
925
|
+
const formatter = template ?? "{path}";
|
|
926
|
+
return files.map((filePath) => {
|
|
927
|
+
const escapedPath = shellEscape(filePath);
|
|
928
|
+
const escapedName = shellEscape(path2.basename(filePath));
|
|
929
|
+
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
930
|
+
}).join(" ");
|
|
931
|
+
}
|
|
932
|
+
function renderTemplate(template, values) {
|
|
933
|
+
return template.replace(/\{([A-Z_]+)\}/g, (match, key) => {
|
|
934
|
+
const replacement = values[key];
|
|
935
|
+
return replacement !== void 0 ? replacement : match;
|
|
936
|
+
});
|
|
937
|
+
}
|
|
938
|
+
function shellEscape(value) {
|
|
939
|
+
if (value.length === 0) {
|
|
940
|
+
return "''";
|
|
941
|
+
}
|
|
942
|
+
if (process.platform === "win32") {
|
|
943
|
+
const escaped = value.replace(/"/g, '\\"');
|
|
944
|
+
return `"${escaped}"`;
|
|
945
|
+
}
|
|
946
|
+
return `'${value.replace(/'/g, `'"'"'`)}'`;
|
|
947
|
+
}
|
|
948
|
+
function formatTimeoutSuffix(timeoutMs) {
|
|
949
|
+
if (!timeoutMs || timeoutMs <= 0) {
|
|
950
|
+
return "";
|
|
951
|
+
}
|
|
952
|
+
const seconds = Math.ceil(timeoutMs / 1e3);
|
|
953
|
+
return ` after ${seconds}s`;
|
|
954
|
+
}
|
|
955
|
+
|
|
673
956
|
// src/evaluation/providers/mock.ts
|
|
674
957
|
var DEFAULT_MOCK_RESPONSE = '{"answer":"Mock provider response. Configure targets.yaml to supply a custom value."}';
|
|
675
958
|
var MockProvider = class {
|
|
@@ -713,6 +996,7 @@ var MockProvider = class {
|
|
|
713
996
|
|
|
714
997
|
// src/evaluation/providers/targets.ts
|
|
715
998
|
import { z } from "zod";
|
|
999
|
+
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES"]);
|
|
716
1000
|
var BASE_TARGET_SCHEMA = z.object({
|
|
717
1001
|
name: z.string().min(1, "target name is required"),
|
|
718
1002
|
provider: z.string().min(1, "provider is required"),
|
|
@@ -769,6 +1053,16 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
769
1053
|
providerBatching,
|
|
770
1054
|
config: resolveGeminiConfig(parsed, env)
|
|
771
1055
|
};
|
|
1056
|
+
case "codex":
|
|
1057
|
+
case "codex-cli":
|
|
1058
|
+
return {
|
|
1059
|
+
kind: "codex",
|
|
1060
|
+
name: parsed.name,
|
|
1061
|
+
judgeTarget: parsed.judge_target,
|
|
1062
|
+
workers: parsed.workers,
|
|
1063
|
+
providerBatching,
|
|
1064
|
+
config: resolveCodexConfig(parsed, env)
|
|
1065
|
+
};
|
|
772
1066
|
case "mock":
|
|
773
1067
|
return {
|
|
774
1068
|
kind: "mock",
|
|
@@ -788,6 +1082,15 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
788
1082
|
providerBatching,
|
|
789
1083
|
config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
|
|
790
1084
|
};
|
|
1085
|
+
case "cli":
|
|
1086
|
+
return {
|
|
1087
|
+
kind: "cli",
|
|
1088
|
+
name: parsed.name,
|
|
1089
|
+
judgeTarget: parsed.judge_target,
|
|
1090
|
+
workers: parsed.workers,
|
|
1091
|
+
providerBatching,
|
|
1092
|
+
config: resolveCliConfig(parsed, env)
|
|
1093
|
+
};
|
|
791
1094
|
default:
|
|
792
1095
|
throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
|
|
793
1096
|
}
|
|
@@ -855,6 +1158,29 @@ function resolveGeminiConfig(target, env) {
|
|
|
855
1158
|
maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`)
|
|
856
1159
|
};
|
|
857
1160
|
}
|
|
1161
|
+
function resolveCodexConfig(target, env) {
|
|
1162
|
+
const settings = target.settings ?? {};
|
|
1163
|
+
const executableSource = settings.executable ?? settings.command ?? settings.binary;
|
|
1164
|
+
const argsSource = settings.args ?? settings.arguments;
|
|
1165
|
+
const cwdSource = settings.cwd;
|
|
1166
|
+
const timeoutSource = settings.timeout_seconds ?? settings.timeoutSeconds;
|
|
1167
|
+
const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
|
|
1168
|
+
allowLiteral: true,
|
|
1169
|
+
optionalEnv: true
|
|
1170
|
+
}) ?? "codex";
|
|
1171
|
+
const args = resolveOptionalStringArray(argsSource, env, `${target.name} codex args`);
|
|
1172
|
+
const cwd = resolveOptionalString(cwdSource, env, `${target.name} codex cwd`, {
|
|
1173
|
+
allowLiteral: true,
|
|
1174
|
+
optionalEnv: true
|
|
1175
|
+
});
|
|
1176
|
+
const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} codex timeout`);
|
|
1177
|
+
return {
|
|
1178
|
+
executable,
|
|
1179
|
+
args,
|
|
1180
|
+
cwd,
|
|
1181
|
+
timeoutMs
|
|
1182
|
+
};
|
|
1183
|
+
}
|
|
858
1184
|
function resolveMockConfig(target) {
|
|
859
1185
|
const settings = target.settings ?? {};
|
|
860
1186
|
const response = typeof settings.response === "string" ? settings.response : void 0;
|
|
@@ -884,6 +1210,125 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
884
1210
|
workspaceTemplate
|
|
885
1211
|
};
|
|
886
1212
|
}
|
|
1213
|
+
function resolveCliConfig(target, env) {
|
|
1214
|
+
const settings = target.settings ?? {};
|
|
1215
|
+
const commandTemplateSource = settings.command_template ?? settings.commandTemplate;
|
|
1216
|
+
const filesFormat = resolveOptionalLiteralString(
|
|
1217
|
+
settings.files_format ?? settings.filesFormat ?? settings.attachments_format ?? settings.attachmentsFormat
|
|
1218
|
+
);
|
|
1219
|
+
const cwd = resolveOptionalString(settings.cwd, env, `${target.name} working directory`, {
|
|
1220
|
+
allowLiteral: true,
|
|
1221
|
+
optionalEnv: true
|
|
1222
|
+
});
|
|
1223
|
+
const envOverrides = resolveEnvOverrides(settings.env, env, target.name);
|
|
1224
|
+
const timeoutMs = resolveTimeoutMs(settings.timeout_seconds ?? settings.timeoutSeconds, `${target.name} timeout`);
|
|
1225
|
+
const healthcheck = resolveCliHealthcheck(settings.healthcheck, env, target.name);
|
|
1226
|
+
const commandTemplate = resolveString(
|
|
1227
|
+
commandTemplateSource,
|
|
1228
|
+
env,
|
|
1229
|
+
`${target.name} CLI command template`,
|
|
1230
|
+
true
|
|
1231
|
+
);
|
|
1232
|
+
assertSupportedCliPlaceholders(commandTemplate, `${target.name} CLI command template`);
|
|
1233
|
+
return {
|
|
1234
|
+
commandTemplate,
|
|
1235
|
+
filesFormat,
|
|
1236
|
+
cwd,
|
|
1237
|
+
env: envOverrides,
|
|
1238
|
+
timeoutMs,
|
|
1239
|
+
healthcheck
|
|
1240
|
+
};
|
|
1241
|
+
}
|
|
1242
|
+
function resolveEnvOverrides(source, env, targetName) {
|
|
1243
|
+
if (source === void 0 || source === null) {
|
|
1244
|
+
return void 0;
|
|
1245
|
+
}
|
|
1246
|
+
if (typeof source !== "object" || Array.isArray(source)) {
|
|
1247
|
+
throw new Error(`${targetName} env overrides must be an object map of strings`);
|
|
1248
|
+
}
|
|
1249
|
+
const entries = Object.entries(source);
|
|
1250
|
+
const resolved = {};
|
|
1251
|
+
for (const [key, value] of entries) {
|
|
1252
|
+
if (typeof value !== "string") {
|
|
1253
|
+
throw new Error(`${targetName} env override '${key}' must be a string`);
|
|
1254
|
+
}
|
|
1255
|
+
const resolvedValue = resolveString(value, env, `${targetName} env override '${key}'`);
|
|
1256
|
+
resolved[key] = resolvedValue;
|
|
1257
|
+
}
|
|
1258
|
+
return Object.keys(resolved).length > 0 ? resolved : void 0;
|
|
1259
|
+
}
|
|
1260
|
+
function resolveTimeoutMs(source, description) {
|
|
1261
|
+
const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
|
|
1262
|
+
if (seconds === void 0) {
|
|
1263
|
+
return void 0;
|
|
1264
|
+
}
|
|
1265
|
+
if (seconds <= 0) {
|
|
1266
|
+
throw new Error(`${description} must be greater than zero seconds`);
|
|
1267
|
+
}
|
|
1268
|
+
return Math.floor(seconds * 1e3);
|
|
1269
|
+
}
|
|
1270
|
+
function resolveCliHealthcheck(source, env, targetName) {
|
|
1271
|
+
if (source === void 0 || source === null) {
|
|
1272
|
+
return void 0;
|
|
1273
|
+
}
|
|
1274
|
+
if (typeof source !== "object" || Array.isArray(source)) {
|
|
1275
|
+
throw new Error(`${targetName} healthcheck must be an object`);
|
|
1276
|
+
}
|
|
1277
|
+
const candidate = source;
|
|
1278
|
+
const type = candidate.type;
|
|
1279
|
+
const timeoutMs = resolveTimeoutMs(
|
|
1280
|
+
candidate.timeout_seconds ?? candidate.timeoutSeconds,
|
|
1281
|
+
`${targetName} healthcheck timeout`
|
|
1282
|
+
);
|
|
1283
|
+
if (type === "http") {
|
|
1284
|
+
const url = resolveString(candidate.url, env, `${targetName} healthcheck URL`);
|
|
1285
|
+
return {
|
|
1286
|
+
type: "http",
|
|
1287
|
+
url,
|
|
1288
|
+
timeoutMs
|
|
1289
|
+
};
|
|
1290
|
+
}
|
|
1291
|
+
if (type === "command") {
|
|
1292
|
+
const commandTemplate = resolveString(
|
|
1293
|
+
candidate.command_template ?? candidate.commandTemplate,
|
|
1294
|
+
env,
|
|
1295
|
+
`${targetName} healthcheck command template`,
|
|
1296
|
+
true
|
|
1297
|
+
);
|
|
1298
|
+
assertSupportedCliPlaceholders(commandTemplate, `${targetName} healthcheck command template`);
|
|
1299
|
+
const cwd = resolveOptionalString(candidate.cwd, env, `${targetName} healthcheck cwd`, {
|
|
1300
|
+
allowLiteral: true,
|
|
1301
|
+
optionalEnv: true
|
|
1302
|
+
});
|
|
1303
|
+
return {
|
|
1304
|
+
type: "command",
|
|
1305
|
+
commandTemplate,
|
|
1306
|
+
timeoutMs,
|
|
1307
|
+
cwd
|
|
1308
|
+
};
|
|
1309
|
+
}
|
|
1310
|
+
throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
|
|
1311
|
+
}
|
|
1312
|
+
function assertSupportedCliPlaceholders(template, description) {
|
|
1313
|
+
const placeholders = extractCliPlaceholders(template);
|
|
1314
|
+
for (const placeholder of placeholders) {
|
|
1315
|
+
if (!CLI_PLACEHOLDERS.has(placeholder)) {
|
|
1316
|
+
throw new Error(
|
|
1317
|
+
`${description} includes unsupported placeholder '{${placeholder}}'. Supported placeholders: ${Array.from(CLI_PLACEHOLDERS).join(", ")}`
|
|
1318
|
+
);
|
|
1319
|
+
}
|
|
1320
|
+
}
|
|
1321
|
+
}
|
|
1322
|
+
function extractCliPlaceholders(template) {
|
|
1323
|
+
const matches = template.matchAll(/\{([A-Z_]+)\}/g);
|
|
1324
|
+
const results = [];
|
|
1325
|
+
for (const match of matches) {
|
|
1326
|
+
if (match[1]) {
|
|
1327
|
+
results.push(match[1]);
|
|
1328
|
+
}
|
|
1329
|
+
}
|
|
1330
|
+
return results;
|
|
1331
|
+
}
|
|
887
1332
|
function resolveString(source, env, description, allowLiteral = false) {
|
|
888
1333
|
const value = resolveOptionalString(source, env, description, {
|
|
889
1334
|
allowLiteral,
|
|
@@ -914,11 +1359,14 @@ function resolveOptionalString(source, env, description, options) {
|
|
|
914
1359
|
}
|
|
915
1360
|
const allowLiteral = options?.allowLiteral ?? false;
|
|
916
1361
|
const optionalEnv = options?.optionalEnv ?? false;
|
|
917
|
-
|
|
1362
|
+
const looksLikeEnv = isLikelyEnvReference(trimmed);
|
|
1363
|
+
if (looksLikeEnv) {
|
|
918
1364
|
if (optionalEnv) {
|
|
919
1365
|
return void 0;
|
|
920
1366
|
}
|
|
921
|
-
|
|
1367
|
+
if (!allowLiteral) {
|
|
1368
|
+
throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
|
|
1369
|
+
}
|
|
922
1370
|
}
|
|
923
1371
|
return trimmed;
|
|
924
1372
|
}
|
|
@@ -968,16 +1416,43 @@ function resolveOptionalBoolean(source) {
|
|
|
968
1416
|
function isLikelyEnvReference(value) {
|
|
969
1417
|
return /^[A-Z0-9_]+$/.test(value);
|
|
970
1418
|
}
|
|
1419
|
+
function resolveOptionalStringArray(source, env, description) {
|
|
1420
|
+
if (source === void 0 || source === null) {
|
|
1421
|
+
return void 0;
|
|
1422
|
+
}
|
|
1423
|
+
if (!Array.isArray(source)) {
|
|
1424
|
+
throw new Error(`${description} must be an array of strings`);
|
|
1425
|
+
}
|
|
1426
|
+
if (source.length === 0) {
|
|
1427
|
+
return void 0;
|
|
1428
|
+
}
|
|
1429
|
+
const resolved = [];
|
|
1430
|
+
for (let i = 0; i < source.length; i++) {
|
|
1431
|
+
const item = source[i];
|
|
1432
|
+
if (typeof item !== "string") {
|
|
1433
|
+
throw new Error(`${description}[${i}] must be a string`);
|
|
1434
|
+
}
|
|
1435
|
+
const trimmed = item.trim();
|
|
1436
|
+
if (trimmed.length === 0) {
|
|
1437
|
+
throw new Error(`${description}[${i}] cannot be empty`);
|
|
1438
|
+
}
|
|
1439
|
+
const envValue = env[trimmed];
|
|
1440
|
+
if (envValue !== void 0) {
|
|
1441
|
+
if (envValue.trim().length === 0) {
|
|
1442
|
+
throw new Error(`Environment variable '${trimmed}' for ${description}[${i}] is empty`);
|
|
1443
|
+
}
|
|
1444
|
+
resolved.push(envValue);
|
|
1445
|
+
} else {
|
|
1446
|
+
resolved.push(trimmed);
|
|
1447
|
+
}
|
|
1448
|
+
}
|
|
1449
|
+
return resolved.length > 0 ? resolved : void 0;
|
|
1450
|
+
}
|
|
971
1451
|
|
|
972
1452
|
// src/evaluation/providers/vscode.ts
|
|
973
1453
|
import { readFile as readFile2 } from "node:fs/promises";
|
|
974
|
-
import
|
|
975
|
-
import {
|
|
976
|
-
dispatchAgentSession,
|
|
977
|
-
dispatchBatchAgent,
|
|
978
|
-
getSubagentRoot,
|
|
979
|
-
provisionSubagents
|
|
980
|
-
} from "subagent";
|
|
1454
|
+
import path3 from "node:path";
|
|
1455
|
+
import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
|
|
981
1456
|
var VSCodeProvider = class {
|
|
982
1457
|
id;
|
|
983
1458
|
kind;
|
|
@@ -994,12 +1469,11 @@ var VSCodeProvider = class {
|
|
|
994
1469
|
if (request.signal?.aborted) {
|
|
995
1470
|
throw new Error("VS Code provider request was aborted before dispatch");
|
|
996
1471
|
}
|
|
997
|
-
const
|
|
998
|
-
const promptContent = buildPromptDocument(request,
|
|
1472
|
+
const inputFiles = normalizeAttachments(request.inputFiles);
|
|
1473
|
+
const promptContent = buildPromptDocument(request, inputFiles, request.guideline_patterns);
|
|
999
1474
|
const session = await dispatchAgentSession({
|
|
1000
1475
|
userQuery: promptContent,
|
|
1001
|
-
|
|
1002
|
-
extraAttachments: attachments,
|
|
1476
|
+
extraAttachments: inputFiles,
|
|
1003
1477
|
wait: this.config.waitForResponse,
|
|
1004
1478
|
dryRun: this.config.dryRun,
|
|
1005
1479
|
vscodeCmd: this.config.command,
|
|
@@ -1016,7 +1490,7 @@ var VSCodeProvider = class {
|
|
|
1016
1490
|
text: "",
|
|
1017
1491
|
raw: {
|
|
1018
1492
|
session,
|
|
1019
|
-
|
|
1493
|
+
inputFiles
|
|
1020
1494
|
}
|
|
1021
1495
|
};
|
|
1022
1496
|
}
|
|
@@ -1025,7 +1499,7 @@ var VSCodeProvider = class {
|
|
|
1025
1499
|
text: responseText,
|
|
1026
1500
|
raw: {
|
|
1027
1501
|
session,
|
|
1028
|
-
|
|
1502
|
+
inputFiles
|
|
1029
1503
|
}
|
|
1030
1504
|
};
|
|
1031
1505
|
}
|
|
@@ -1035,17 +1509,17 @@ var VSCodeProvider = class {
|
|
|
1035
1509
|
}
|
|
1036
1510
|
const normalizedRequests = requests.map((req) => ({
|
|
1037
1511
|
request: req,
|
|
1038
|
-
|
|
1512
|
+
inputFiles: normalizeAttachments(req.inputFiles)
|
|
1039
1513
|
}));
|
|
1040
|
-
const
|
|
1041
|
-
normalizedRequests.map(({
|
|
1514
|
+
const combinedInputFiles = mergeAttachments(
|
|
1515
|
+
normalizedRequests.map(({ inputFiles }) => inputFiles)
|
|
1042
1516
|
);
|
|
1043
1517
|
const userQueries = normalizedRequests.map(
|
|
1044
|
-
({ request,
|
|
1518
|
+
({ request, inputFiles }) => buildPromptDocument(request, inputFiles, request.guideline_patterns)
|
|
1045
1519
|
);
|
|
1046
1520
|
const session = await dispatchBatchAgent({
|
|
1047
1521
|
userQueries,
|
|
1048
|
-
extraAttachments:
|
|
1522
|
+
extraAttachments: combinedInputFiles,
|
|
1049
1523
|
wait: this.config.waitForResponse,
|
|
1050
1524
|
dryRun: this.config.dryRun,
|
|
1051
1525
|
vscodeCmd: this.config.command,
|
|
@@ -1058,12 +1532,12 @@ var VSCodeProvider = class {
|
|
|
1058
1532
|
throw new Error(failure);
|
|
1059
1533
|
}
|
|
1060
1534
|
if (this.config.dryRun) {
|
|
1061
|
-
return normalizedRequests.map(({
|
|
1535
|
+
return normalizedRequests.map(({ inputFiles }) => ({
|
|
1062
1536
|
text: "",
|
|
1063
1537
|
raw: {
|
|
1064
1538
|
session,
|
|
1065
|
-
|
|
1066
|
-
|
|
1539
|
+
inputFiles,
|
|
1540
|
+
allInputFiles: combinedInputFiles
|
|
1067
1541
|
}
|
|
1068
1542
|
}));
|
|
1069
1543
|
}
|
|
@@ -1079,8 +1553,8 @@ var VSCodeProvider = class {
|
|
|
1079
1553
|
text: responseText,
|
|
1080
1554
|
raw: {
|
|
1081
1555
|
session,
|
|
1082
|
-
|
|
1083
|
-
|
|
1556
|
+
inputFiles: normalizedRequests[index]?.inputFiles,
|
|
1557
|
+
allInputFiles: combinedInputFiles,
|
|
1084
1558
|
responseFile
|
|
1085
1559
|
}
|
|
1086
1560
|
});
|
|
@@ -1107,7 +1581,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, attachmentFiles) {
|
|
|
1107
1581
|
return "";
|
|
1108
1582
|
}
|
|
1109
1583
|
const buildList = (files) => files.map((absolutePath) => {
|
|
1110
|
-
const fileName =
|
|
1584
|
+
const fileName = path3.basename(absolutePath);
|
|
1111
1585
|
const fileUri = pathToFileUri(absolutePath);
|
|
1112
1586
|
return `* [${fileName}](${fileUri})`;
|
|
1113
1587
|
});
|
|
@@ -1132,8 +1606,8 @@ function collectGuidelineFiles(attachments, guidelinePatterns) {
|
|
|
1132
1606
|
}
|
|
1133
1607
|
const unique = /* @__PURE__ */ new Map();
|
|
1134
1608
|
for (const attachment of attachments) {
|
|
1135
|
-
const absolutePath =
|
|
1136
|
-
const normalized = absolutePath.split(
|
|
1609
|
+
const absolutePath = path3.resolve(attachment);
|
|
1610
|
+
const normalized = absolutePath.split(path3.sep).join("/");
|
|
1137
1611
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1138
1612
|
if (!unique.has(absolutePath)) {
|
|
1139
1613
|
unique.set(absolutePath, absolutePath);
|
|
@@ -1146,86 +1620,662 @@ function collectAttachmentFiles(attachments) {
|
|
|
1146
1620
|
if (!attachments || attachments.length === 0) {
|
|
1147
1621
|
return [];
|
|
1148
1622
|
}
|
|
1149
|
-
const unique = /* @__PURE__ */ new Map();
|
|
1150
|
-
for (const attachment of attachments) {
|
|
1151
|
-
const absolutePath =
|
|
1152
|
-
if (!unique.has(absolutePath)) {
|
|
1153
|
-
unique.set(absolutePath, absolutePath);
|
|
1623
|
+
const unique = /* @__PURE__ */ new Map();
|
|
1624
|
+
for (const attachment of attachments) {
|
|
1625
|
+
const absolutePath = path3.resolve(attachment);
|
|
1626
|
+
if (!unique.has(absolutePath)) {
|
|
1627
|
+
unique.set(absolutePath, absolutePath);
|
|
1628
|
+
}
|
|
1629
|
+
}
|
|
1630
|
+
return Array.from(unique.values());
|
|
1631
|
+
}
|
|
1632
|
+
function pathToFileUri(filePath) {
|
|
1633
|
+
const absolutePath = path3.isAbsolute(filePath) ? filePath : path3.resolve(filePath);
|
|
1634
|
+
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
1635
|
+
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
1636
|
+
return `file:///${normalizedPath}`;
|
|
1637
|
+
}
|
|
1638
|
+
return `file://${normalizedPath}`;
|
|
1639
|
+
}
|
|
1640
|
+
function normalizeAttachments(attachments) {
|
|
1641
|
+
if (!attachments || attachments.length === 0) {
|
|
1642
|
+
return void 0;
|
|
1643
|
+
}
|
|
1644
|
+
const deduped = /* @__PURE__ */ new Set();
|
|
1645
|
+
for (const attachment of attachments) {
|
|
1646
|
+
deduped.add(path3.resolve(attachment));
|
|
1647
|
+
}
|
|
1648
|
+
return Array.from(deduped);
|
|
1649
|
+
}
|
|
1650
|
+
function mergeAttachments(all) {
|
|
1651
|
+
const deduped = /* @__PURE__ */ new Set();
|
|
1652
|
+
for (const list of all) {
|
|
1653
|
+
if (!list) continue;
|
|
1654
|
+
for (const inputFile of list) {
|
|
1655
|
+
deduped.add(path3.resolve(inputFile));
|
|
1656
|
+
}
|
|
1657
|
+
}
|
|
1658
|
+
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
1659
|
+
}
|
|
1660
|
+
async function ensureVSCodeSubagents(options) {
|
|
1661
|
+
const { kind, count, verbose = false } = options;
|
|
1662
|
+
const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
|
|
1663
|
+
const subagentRoot = getSubagentRoot(vscodeCmd);
|
|
1664
|
+
try {
|
|
1665
|
+
if (verbose) {
|
|
1666
|
+
console.log(`Provisioning ${count} subagent(s) via: subagent ${vscodeCmd} provision`);
|
|
1667
|
+
}
|
|
1668
|
+
const result = await provisionSubagents({
|
|
1669
|
+
targetRoot: subagentRoot,
|
|
1670
|
+
subagents: count,
|
|
1671
|
+
dryRun: false
|
|
1672
|
+
});
|
|
1673
|
+
if (verbose) {
|
|
1674
|
+
if (result.created.length > 0) {
|
|
1675
|
+
console.log(`Created ${result.created.length} new subagent(s)`);
|
|
1676
|
+
}
|
|
1677
|
+
if (result.skippedExisting.length > 0) {
|
|
1678
|
+
console.log(`Reusing ${result.skippedExisting.length} existing unlocked subagent(s)`);
|
|
1679
|
+
}
|
|
1680
|
+
console.log(`
|
|
1681
|
+
total unlocked subagents available: ${result.created.length + result.skippedExisting.length}`);
|
|
1682
|
+
}
|
|
1683
|
+
return {
|
|
1684
|
+
provisioned: true,
|
|
1685
|
+
message: `Provisioned ${count} subagent(s): ${result.created.length} created, ${result.skippedExisting.length} reused`
|
|
1686
|
+
};
|
|
1687
|
+
} catch (error) {
|
|
1688
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
1689
|
+
if (verbose) {
|
|
1690
|
+
console.warn(`Provisioning failed (continuing anyway): ${errorMessage}`);
|
|
1691
|
+
}
|
|
1692
|
+
return {
|
|
1693
|
+
provisioned: false,
|
|
1694
|
+
message: `Provisioning failed: ${errorMessage}`
|
|
1695
|
+
};
|
|
1696
|
+
}
|
|
1697
|
+
}
|
|
1698
|
+
|
|
1699
|
+
// src/evaluation/providers/codex.ts
|
|
1700
|
+
import { exec as execCallback, spawn } from "node:child_process";
|
|
1701
|
+
import { constants as constants2 } from "node:fs";
|
|
1702
|
+
import { access as access2, copyFile, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
|
|
1703
|
+
import { tmpdir } from "node:os";
|
|
1704
|
+
import path5 from "node:path";
|
|
1705
|
+
import { promisify as promisify2 } from "node:util";
|
|
1706
|
+
|
|
1707
|
+
// src/evaluation/providers/preread.ts
|
|
1708
|
+
import path4 from "node:path";
|
|
1709
|
+
function buildPromptDocument2(request, inputFiles, options) {
|
|
1710
|
+
const parts = [];
|
|
1711
|
+
const guidelineFiles = collectGuidelineFiles2(
|
|
1712
|
+
inputFiles,
|
|
1713
|
+
options?.guidelinePatterns ?? request.guideline_patterns,
|
|
1714
|
+
options?.guidelineOverrides
|
|
1715
|
+
);
|
|
1716
|
+
const inputFilesList = collectInputFiles(inputFiles);
|
|
1717
|
+
const nonGuidelineInputFiles = inputFilesList.filter(
|
|
1718
|
+
(file) => !guidelineFiles.includes(file)
|
|
1719
|
+
);
|
|
1720
|
+
const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineInputFiles);
|
|
1721
|
+
if (prereadBlock.length > 0) {
|
|
1722
|
+
parts.push("\n", prereadBlock);
|
|
1723
|
+
}
|
|
1724
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
|
|
1725
|
+
return parts.join("\n").trim();
|
|
1726
|
+
}
|
|
1727
|
+
function normalizeInputFiles2(inputFiles) {
|
|
1728
|
+
if (!inputFiles || inputFiles.length === 0) {
|
|
1729
|
+
return void 0;
|
|
1730
|
+
}
|
|
1731
|
+
const deduped = /* @__PURE__ */ new Map();
|
|
1732
|
+
for (const inputFile of inputFiles) {
|
|
1733
|
+
const absolutePath = path4.resolve(inputFile);
|
|
1734
|
+
if (!deduped.has(absolutePath)) {
|
|
1735
|
+
deduped.set(absolutePath, absolutePath);
|
|
1736
|
+
}
|
|
1737
|
+
}
|
|
1738
|
+
return Array.from(deduped.values());
|
|
1739
|
+
}
|
|
1740
|
+
function collectGuidelineFiles2(inputFiles, guidelinePatterns, overrides) {
|
|
1741
|
+
if (!inputFiles || inputFiles.length === 0) {
|
|
1742
|
+
return [];
|
|
1743
|
+
}
|
|
1744
|
+
const unique = /* @__PURE__ */ new Map();
|
|
1745
|
+
for (const inputFile of inputFiles) {
|
|
1746
|
+
const absolutePath = path4.resolve(inputFile);
|
|
1747
|
+
if (overrides?.has(absolutePath)) {
|
|
1748
|
+
if (!unique.has(absolutePath)) {
|
|
1749
|
+
unique.set(absolutePath, absolutePath);
|
|
1750
|
+
}
|
|
1751
|
+
continue;
|
|
1752
|
+
}
|
|
1753
|
+
const normalized = absolutePath.split(path4.sep).join("/");
|
|
1754
|
+
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1755
|
+
if (!unique.has(absolutePath)) {
|
|
1756
|
+
unique.set(absolutePath, absolutePath);
|
|
1757
|
+
}
|
|
1758
|
+
}
|
|
1759
|
+
}
|
|
1760
|
+
return Array.from(unique.values());
|
|
1761
|
+
}
|
|
1762
|
+
function collectInputFiles(inputFiles) {
|
|
1763
|
+
if (!inputFiles || inputFiles.length === 0) {
|
|
1764
|
+
return [];
|
|
1765
|
+
}
|
|
1766
|
+
const unique = /* @__PURE__ */ new Map();
|
|
1767
|
+
for (const inputFile of inputFiles) {
|
|
1768
|
+
const absolutePath = path4.resolve(inputFile);
|
|
1769
|
+
if (!unique.has(absolutePath)) {
|
|
1770
|
+
unique.set(absolutePath, absolutePath);
|
|
1771
|
+
}
|
|
1772
|
+
}
|
|
1773
|
+
return Array.from(unique.values());
|
|
1774
|
+
}
|
|
1775
|
+
function buildMandatoryPrereadBlock2(guidelineFiles, inputFiles) {
|
|
1776
|
+
if (guidelineFiles.length === 0 && inputFiles.length === 0) {
|
|
1777
|
+
return "";
|
|
1778
|
+
}
|
|
1779
|
+
const buildList = (files) => files.map((absolutePath) => {
|
|
1780
|
+
const fileName = path4.basename(absolutePath);
|
|
1781
|
+
const fileUri = pathToFileUri2(absolutePath);
|
|
1782
|
+
return `* [${fileName}](${fileUri})`;
|
|
1783
|
+
});
|
|
1784
|
+
const sections = [];
|
|
1785
|
+
if (guidelineFiles.length > 0) {
|
|
1786
|
+
sections.push(`Read all guideline files:
|
|
1787
|
+
${buildList(guidelineFiles).join("\n")}.`);
|
|
1788
|
+
}
|
|
1789
|
+
if (inputFiles.length > 0) {
|
|
1790
|
+
sections.push(`Read all input files:
|
|
1791
|
+
${buildList(inputFiles).join("\n")}.`);
|
|
1792
|
+
}
|
|
1793
|
+
sections.push(
|
|
1794
|
+
"If any file is missing, fail with ERROR: missing-file <filename> and stop.",
|
|
1795
|
+
"Then apply system_instructions on the user query below."
|
|
1796
|
+
);
|
|
1797
|
+
return sections.join("\n");
|
|
1798
|
+
}
|
|
1799
|
+
function pathToFileUri2(filePath) {
|
|
1800
|
+
const absolutePath = path4.isAbsolute(filePath) ? filePath : path4.resolve(filePath);
|
|
1801
|
+
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
1802
|
+
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
1803
|
+
return `file:///${normalizedPath}`;
|
|
1804
|
+
}
|
|
1805
|
+
return `file://${normalizedPath}`;
|
|
1806
|
+
}
|
|
1807
|
+
|
|
1808
|
+
// src/evaluation/providers/codex.ts
|
|
1809
|
+
var execAsync2 = promisify2(execCallback);
|
|
1810
|
+
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
1811
|
+
var PROMPT_FILENAME = "prompt.md";
|
|
1812
|
+
var FILES_DIR = "files";
|
|
1813
|
+
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
1814
|
+
var CodexProvider = class {
|
|
1815
|
+
id;
|
|
1816
|
+
kind = "codex";
|
|
1817
|
+
targetName;
|
|
1818
|
+
supportsBatch = false;
|
|
1819
|
+
config;
|
|
1820
|
+
runCodex;
|
|
1821
|
+
environmentCheck;
|
|
1822
|
+
resolvedExecutable;
|
|
1823
|
+
constructor(targetName, config, runner = defaultCodexRunner) {
|
|
1824
|
+
this.id = `codex:${targetName}`;
|
|
1825
|
+
this.targetName = targetName;
|
|
1826
|
+
this.config = config;
|
|
1827
|
+
this.runCodex = runner;
|
|
1828
|
+
}
|
|
1829
|
+
async invoke(request) {
|
|
1830
|
+
if (request.signal?.aborted) {
|
|
1831
|
+
throw new Error("Codex provider request was aborted before execution");
|
|
1832
|
+
}
|
|
1833
|
+
await this.ensureEnvironmentReady();
|
|
1834
|
+
const inputFiles = normalizeInputFiles2(request.inputFiles);
|
|
1835
|
+
const originalGuidelines = new Set(
|
|
1836
|
+
collectGuidelineFiles2(inputFiles, request.guideline_patterns).map((file) => path5.resolve(file))
|
|
1837
|
+
);
|
|
1838
|
+
const workspaceRoot = await this.createWorkspace();
|
|
1839
|
+
try {
|
|
1840
|
+
const { mirroredInputFiles, guidelineMirrors } = await this.mirrorInputFiles(
|
|
1841
|
+
inputFiles,
|
|
1842
|
+
workspaceRoot,
|
|
1843
|
+
originalGuidelines
|
|
1844
|
+
);
|
|
1845
|
+
const promptContent = buildPromptDocument2(request, mirroredInputFiles, {
|
|
1846
|
+
guidelinePatterns: request.guideline_patterns,
|
|
1847
|
+
guidelineOverrides: guidelineMirrors
|
|
1848
|
+
});
|
|
1849
|
+
const promptFile = path5.join(workspaceRoot, PROMPT_FILENAME);
|
|
1850
|
+
await writeFile(promptFile, promptContent, "utf8");
|
|
1851
|
+
const args = this.buildCodexArgs();
|
|
1852
|
+
const cwd = this.resolveCwd(workspaceRoot);
|
|
1853
|
+
const result = await this.executeCodex(args, cwd, promptContent, request.signal);
|
|
1854
|
+
if (result.timedOut) {
|
|
1855
|
+
throw new Error(
|
|
1856
|
+
`Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
|
|
1857
|
+
);
|
|
1858
|
+
}
|
|
1859
|
+
if (result.exitCode !== 0) {
|
|
1860
|
+
const detail = pickDetail(result.stderr, result.stdout);
|
|
1861
|
+
const prefix = `Codex CLI exited with code ${result.exitCode}`;
|
|
1862
|
+
throw new Error(detail ? `${prefix}: ${detail}` : prefix);
|
|
1863
|
+
}
|
|
1864
|
+
const parsed = parseCodexJson(result.stdout);
|
|
1865
|
+
const assistantText = extractAssistantText(parsed);
|
|
1866
|
+
return {
|
|
1867
|
+
text: assistantText,
|
|
1868
|
+
raw: {
|
|
1869
|
+
response: parsed,
|
|
1870
|
+
stdout: result.stdout,
|
|
1871
|
+
stderr: result.stderr,
|
|
1872
|
+
exitCode: result.exitCode,
|
|
1873
|
+
args,
|
|
1874
|
+
executable: this.resolvedExecutable ?? this.config.executable,
|
|
1875
|
+
promptFile,
|
|
1876
|
+
workspace: workspaceRoot,
|
|
1877
|
+
inputFiles: mirroredInputFiles
|
|
1878
|
+
}
|
|
1879
|
+
};
|
|
1880
|
+
} finally {
|
|
1881
|
+
await this.cleanupWorkspace(workspaceRoot);
|
|
1882
|
+
}
|
|
1883
|
+
}
|
|
1884
|
+
async ensureEnvironmentReady() {
|
|
1885
|
+
if (!this.environmentCheck) {
|
|
1886
|
+
this.environmentCheck = this.validateEnvironment();
|
|
1887
|
+
}
|
|
1888
|
+
await this.environmentCheck;
|
|
1889
|
+
}
|
|
1890
|
+
async validateEnvironment() {
|
|
1891
|
+
this.resolvedExecutable = await locateExecutable(this.config.executable);
|
|
1892
|
+
}
|
|
1893
|
+
resolveCwd(workspaceRoot) {
|
|
1894
|
+
if (!this.config.cwd) {
|
|
1895
|
+
return workspaceRoot;
|
|
1896
|
+
}
|
|
1897
|
+
return path5.resolve(this.config.cwd);
|
|
1898
|
+
}
|
|
1899
|
+
buildCodexArgs() {
|
|
1900
|
+
const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
|
|
1901
|
+
if (this.config.args && this.config.args.length > 0) {
|
|
1902
|
+
args.push(...this.config.args);
|
|
1903
|
+
}
|
|
1904
|
+
args.push("-");
|
|
1905
|
+
return args;
|
|
1906
|
+
}
|
|
1907
|
+
async executeCodex(args, cwd, promptContent, signal) {
|
|
1908
|
+
try {
|
|
1909
|
+
return await this.runCodex({
|
|
1910
|
+
executable: this.resolvedExecutable ?? this.config.executable,
|
|
1911
|
+
args,
|
|
1912
|
+
cwd,
|
|
1913
|
+
prompt: promptContent,
|
|
1914
|
+
timeoutMs: this.config.timeoutMs,
|
|
1915
|
+
env: process.env,
|
|
1916
|
+
signal
|
|
1917
|
+
});
|
|
1918
|
+
} catch (error) {
|
|
1919
|
+
const err = error;
|
|
1920
|
+
if (err.code === "ENOENT") {
|
|
1921
|
+
throw new Error(
|
|
1922
|
+
`Codex executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
|
|
1923
|
+
);
|
|
1924
|
+
}
|
|
1925
|
+
throw error;
|
|
1926
|
+
}
|
|
1927
|
+
}
|
|
1928
|
+
async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
|
|
1929
|
+
if (!inputFiles || inputFiles.length === 0) {
|
|
1930
|
+
return {
|
|
1931
|
+
mirroredInputFiles: void 0,
|
|
1932
|
+
guidelineMirrors: /* @__PURE__ */ new Set()
|
|
1933
|
+
};
|
|
1934
|
+
}
|
|
1935
|
+
const filesRoot = path5.join(workspaceRoot, FILES_DIR);
|
|
1936
|
+
await mkdir(filesRoot, { recursive: true });
|
|
1937
|
+
const mirrored = [];
|
|
1938
|
+
const guidelineMirrors = /* @__PURE__ */ new Set();
|
|
1939
|
+
const nameCounts = /* @__PURE__ */ new Map();
|
|
1940
|
+
for (const inputFile of inputFiles) {
|
|
1941
|
+
const absoluteSource = path5.resolve(inputFile);
|
|
1942
|
+
const baseName = path5.basename(absoluteSource);
|
|
1943
|
+
const count = nameCounts.get(baseName) ?? 0;
|
|
1944
|
+
nameCounts.set(baseName, count + 1);
|
|
1945
|
+
const finalName = count === 0 ? baseName : `${baseName}.${count}`;
|
|
1946
|
+
const destination = path5.join(filesRoot, finalName);
|
|
1947
|
+
await copyFile(absoluteSource, destination);
|
|
1948
|
+
const resolvedDestination = path5.resolve(destination);
|
|
1949
|
+
mirrored.push(resolvedDestination);
|
|
1950
|
+
if (guidelineOriginals.has(absoluteSource)) {
|
|
1951
|
+
guidelineMirrors.add(resolvedDestination);
|
|
1952
|
+
}
|
|
1953
|
+
}
|
|
1954
|
+
return {
|
|
1955
|
+
mirroredInputFiles: mirrored,
|
|
1956
|
+
guidelineMirrors
|
|
1957
|
+
};
|
|
1958
|
+
}
|
|
1959
|
+
async createWorkspace() {
|
|
1960
|
+
return await mkdtemp(path5.join(tmpdir(), WORKSPACE_PREFIX));
|
|
1961
|
+
}
|
|
1962
|
+
async cleanupWorkspace(workspaceRoot) {
|
|
1963
|
+
try {
|
|
1964
|
+
await rm(workspaceRoot, { recursive: true, force: true });
|
|
1965
|
+
} catch {
|
|
1966
|
+
}
|
|
1967
|
+
}
|
|
1968
|
+
};
|
|
1969
|
+
async function locateExecutable(candidate) {
|
|
1970
|
+
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
1971
|
+
if (includesPathSeparator) {
|
|
1972
|
+
const resolved = path5.isAbsolute(candidate) ? candidate : path5.resolve(candidate);
|
|
1973
|
+
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
1974
|
+
await access2(executablePath, constants2.F_OK);
|
|
1975
|
+
return executablePath;
|
|
1976
|
+
}
|
|
1977
|
+
const locator = process.platform === "win32" ? "where" : "which";
|
|
1978
|
+
try {
|
|
1979
|
+
const { stdout } = await execAsync2(`${locator} ${candidate}`);
|
|
1980
|
+
const lines = stdout.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
|
|
1981
|
+
const preferred = selectExecutableCandidate(lines);
|
|
1982
|
+
if (preferred) {
|
|
1983
|
+
const executablePath = await ensureWindowsExecutableVariant(preferred);
|
|
1984
|
+
await access2(executablePath, constants2.F_OK);
|
|
1985
|
+
return executablePath;
|
|
1986
|
+
}
|
|
1987
|
+
} catch {
|
|
1988
|
+
}
|
|
1989
|
+
throw new Error(`Codex executable '${candidate}' was not found on PATH`);
|
|
1990
|
+
}
|
|
1991
|
+
function selectExecutableCandidate(candidates) {
|
|
1992
|
+
if (candidates.length === 0) {
|
|
1993
|
+
return void 0;
|
|
1994
|
+
}
|
|
1995
|
+
if (process.platform !== "win32") {
|
|
1996
|
+
return candidates[0];
|
|
1997
|
+
}
|
|
1998
|
+
const extensions = getWindowsExecutableExtensions();
|
|
1999
|
+
for (const ext of extensions) {
|
|
2000
|
+
const match = candidates.find((candidate) => candidate.toLowerCase().endsWith(ext));
|
|
2001
|
+
if (match) {
|
|
2002
|
+
return match;
|
|
2003
|
+
}
|
|
2004
|
+
}
|
|
2005
|
+
return candidates[0];
|
|
2006
|
+
}
|
|
2007
|
+
async function ensureWindowsExecutableVariant(candidate) {
|
|
2008
|
+
if (process.platform !== "win32") {
|
|
2009
|
+
return candidate;
|
|
2010
|
+
}
|
|
2011
|
+
if (hasExecutableExtension(candidate)) {
|
|
2012
|
+
return candidate;
|
|
2013
|
+
}
|
|
2014
|
+
const extensions = getWindowsExecutableExtensions();
|
|
2015
|
+
for (const ext of extensions) {
|
|
2016
|
+
const withExtension = `${candidate}${ext}`;
|
|
2017
|
+
try {
|
|
2018
|
+
await access2(withExtension, constants2.F_OK);
|
|
2019
|
+
return withExtension;
|
|
2020
|
+
} catch {
|
|
2021
|
+
}
|
|
2022
|
+
}
|
|
2023
|
+
return candidate;
|
|
2024
|
+
}
|
|
2025
|
+
function hasExecutableExtension(candidate) {
|
|
2026
|
+
const lower = candidate.toLowerCase();
|
|
2027
|
+
return getWindowsExecutableExtensions().some((ext) => lower.endsWith(ext));
|
|
2028
|
+
}
|
|
2029
|
+
var DEFAULT_WINDOWS_EXTENSIONS = [".com", ".exe", ".bat", ".cmd", ".ps1"];
|
|
2030
|
+
function getWindowsExecutableExtensions() {
|
|
2031
|
+
if (process.platform !== "win32") {
|
|
2032
|
+
return [];
|
|
2033
|
+
}
|
|
2034
|
+
const fromEnv = process.env.PATHEXT?.split(";").map((ext) => ext.trim().toLowerCase()).filter((ext) => ext.length > 0);
|
|
2035
|
+
return fromEnv && fromEnv.length > 0 ? fromEnv : DEFAULT_WINDOWS_EXTENSIONS;
|
|
2036
|
+
}
|
|
2037
|
+
function parseCodexJson(output) {
|
|
2038
|
+
const trimmed = output.trim();
|
|
2039
|
+
if (trimmed.length === 0) {
|
|
2040
|
+
throw new Error("Codex CLI produced no output in --json mode");
|
|
2041
|
+
}
|
|
2042
|
+
try {
|
|
2043
|
+
return JSON.parse(trimmed);
|
|
2044
|
+
} catch {
|
|
2045
|
+
const lineObjects = parseJsonLines(trimmed);
|
|
2046
|
+
if (lineObjects) {
|
|
2047
|
+
return lineObjects;
|
|
2048
|
+
}
|
|
2049
|
+
const lastBrace = trimmed.lastIndexOf("{");
|
|
2050
|
+
if (lastBrace >= 0) {
|
|
2051
|
+
const candidate = trimmed.slice(lastBrace);
|
|
2052
|
+
try {
|
|
2053
|
+
return JSON.parse(candidate);
|
|
2054
|
+
} catch {
|
|
2055
|
+
}
|
|
2056
|
+
}
|
|
2057
|
+
const preview = trimmed.slice(0, 200);
|
|
2058
|
+
throw new Error(`Codex CLI emitted invalid JSON: ${preview}${trimmed.length > 200 ? "\u2026" : ""}`);
|
|
2059
|
+
}
|
|
2060
|
+
}
|
|
2061
|
+
function extractAssistantText(parsed) {
|
|
2062
|
+
if (Array.isArray(parsed)) {
|
|
2063
|
+
const text = extractFromEventStream(parsed);
|
|
2064
|
+
if (text) {
|
|
2065
|
+
return text;
|
|
2066
|
+
}
|
|
2067
|
+
}
|
|
2068
|
+
if (!parsed || typeof parsed !== "object") {
|
|
2069
|
+
throw new Error("Codex CLI JSON response did not include an assistant message");
|
|
2070
|
+
}
|
|
2071
|
+
const record = parsed;
|
|
2072
|
+
const eventText = extractFromEvent(record);
|
|
2073
|
+
if (eventText) {
|
|
2074
|
+
return eventText;
|
|
2075
|
+
}
|
|
2076
|
+
const messages = Array.isArray(record.messages) ? record.messages : void 0;
|
|
2077
|
+
if (messages) {
|
|
2078
|
+
for (let index = messages.length - 1; index >= 0; index -= 1) {
|
|
2079
|
+
const entry = messages[index];
|
|
2080
|
+
if (!entry || typeof entry !== "object") {
|
|
2081
|
+
continue;
|
|
2082
|
+
}
|
|
2083
|
+
const role = entry.role;
|
|
2084
|
+
if (role !== "assistant") {
|
|
2085
|
+
continue;
|
|
2086
|
+
}
|
|
2087
|
+
const content = entry.content;
|
|
2088
|
+
const flattened = flattenContent(content);
|
|
2089
|
+
if (flattened) {
|
|
2090
|
+
return flattened;
|
|
2091
|
+
}
|
|
2092
|
+
}
|
|
2093
|
+
}
|
|
2094
|
+
const response = record.response;
|
|
2095
|
+
if (response && typeof response === "object") {
|
|
2096
|
+
const content = response.content;
|
|
2097
|
+
const flattened = flattenContent(content);
|
|
2098
|
+
if (flattened) {
|
|
2099
|
+
return flattened;
|
|
2100
|
+
}
|
|
2101
|
+
}
|
|
2102
|
+
const output = record.output;
|
|
2103
|
+
const flattenedOutput = flattenContent(output);
|
|
2104
|
+
if (flattenedOutput) {
|
|
2105
|
+
return flattenedOutput;
|
|
2106
|
+
}
|
|
2107
|
+
throw new Error("Codex CLI JSON response did not include an assistant message");
|
|
2108
|
+
}
|
|
2109
|
+
function extractFromEventStream(events) {
|
|
2110
|
+
for (let index = events.length - 1; index >= 0; index -= 1) {
|
|
2111
|
+
const candidate = events[index];
|
|
2112
|
+
const text = extractFromEvent(candidate);
|
|
2113
|
+
if (text) {
|
|
2114
|
+
return text;
|
|
2115
|
+
}
|
|
2116
|
+
}
|
|
2117
|
+
return void 0;
|
|
2118
|
+
}
|
|
2119
|
+
function extractFromEvent(event) {
|
|
2120
|
+
if (!event || typeof event !== "object") {
|
|
2121
|
+
return void 0;
|
|
2122
|
+
}
|
|
2123
|
+
const record = event;
|
|
2124
|
+
const type = typeof record.type === "string" ? record.type : void 0;
|
|
2125
|
+
if (type === JSONL_TYPE_ITEM_COMPLETED) {
|
|
2126
|
+
const item = record.item;
|
|
2127
|
+
const text = extractFromItem(item);
|
|
2128
|
+
if (text) {
|
|
2129
|
+
return text;
|
|
2130
|
+
}
|
|
2131
|
+
}
|
|
2132
|
+
const output = record.output ?? record.content;
|
|
2133
|
+
const flattened = flattenContent(output);
|
|
2134
|
+
if (flattened) {
|
|
2135
|
+
return flattened;
|
|
2136
|
+
}
|
|
2137
|
+
return void 0;
|
|
2138
|
+
}
|
|
2139
|
+
function extractFromItem(item) {
|
|
2140
|
+
if (!item || typeof item !== "object") {
|
|
2141
|
+
return void 0;
|
|
2142
|
+
}
|
|
2143
|
+
const record = item;
|
|
2144
|
+
const itemType = typeof record.type === "string" ? record.type : void 0;
|
|
2145
|
+
if (itemType === "agent_message" || itemType === "response" || itemType === "output") {
|
|
2146
|
+
const text = flattenContent(record.text ?? record.content ?? record.output);
|
|
2147
|
+
if (text) {
|
|
2148
|
+
return text;
|
|
1154
2149
|
}
|
|
1155
2150
|
}
|
|
1156
|
-
return
|
|
2151
|
+
return void 0;
|
|
1157
2152
|
}
|
|
1158
|
-
function
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
1162
|
-
return `file:///${normalizedPath}`;
|
|
2153
|
+
function flattenContent(value) {
|
|
2154
|
+
if (typeof value === "string") {
|
|
2155
|
+
return value;
|
|
1163
2156
|
}
|
|
1164
|
-
|
|
2157
|
+
if (Array.isArray(value)) {
|
|
2158
|
+
const parts = value.map((segment) => {
|
|
2159
|
+
if (typeof segment === "string") {
|
|
2160
|
+
return segment;
|
|
2161
|
+
}
|
|
2162
|
+
if (segment && typeof segment === "object" && "text" in segment) {
|
|
2163
|
+
const text = segment.text;
|
|
2164
|
+
return typeof text === "string" ? text : void 0;
|
|
2165
|
+
}
|
|
2166
|
+
return void 0;
|
|
2167
|
+
}).filter((part) => typeof part === "string" && part.length > 0);
|
|
2168
|
+
return parts.length > 0 ? parts.join(" \n") : void 0;
|
|
2169
|
+
}
|
|
2170
|
+
if (value && typeof value === "object" && "text" in value) {
|
|
2171
|
+
const text = value.text;
|
|
2172
|
+
return typeof text === "string" ? text : void 0;
|
|
2173
|
+
}
|
|
2174
|
+
return void 0;
|
|
1165
2175
|
}
|
|
1166
|
-
function
|
|
1167
|
-
|
|
2176
|
+
function parseJsonLines(output) {
|
|
2177
|
+
const lines = output.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
|
|
2178
|
+
if (lines.length <= 1) {
|
|
1168
2179
|
return void 0;
|
|
1169
2180
|
}
|
|
1170
|
-
const
|
|
1171
|
-
for (const
|
|
1172
|
-
|
|
2181
|
+
const parsed = [];
|
|
2182
|
+
for (const line of lines) {
|
|
2183
|
+
try {
|
|
2184
|
+
parsed.push(JSON.parse(line));
|
|
2185
|
+
} catch {
|
|
2186
|
+
return void 0;
|
|
2187
|
+
}
|
|
1173
2188
|
}
|
|
1174
|
-
return
|
|
2189
|
+
return parsed;
|
|
1175
2190
|
}
|
|
1176
|
-
function
|
|
1177
|
-
const
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
for (const attachment of list) {
|
|
1181
|
-
deduped.add(path2.resolve(attachment));
|
|
1182
|
-
}
|
|
2191
|
+
function pickDetail(stderr, stdout) {
|
|
2192
|
+
const errorText = stderr.trim();
|
|
2193
|
+
if (errorText.length > 0) {
|
|
2194
|
+
return errorText;
|
|
1183
2195
|
}
|
|
1184
|
-
|
|
2196
|
+
const stdoutText = stdout.trim();
|
|
2197
|
+
return stdoutText.length > 0 ? stdoutText : void 0;
|
|
1185
2198
|
}
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
2199
|
+
function formatTimeoutSuffix2(timeoutMs) {
|
|
2200
|
+
if (!timeoutMs || timeoutMs <= 0) {
|
|
2201
|
+
return "";
|
|
2202
|
+
}
|
|
2203
|
+
const seconds = Math.ceil(timeoutMs / 1e3);
|
|
2204
|
+
return ` after ${seconds}s`;
|
|
2205
|
+
}
|
|
2206
|
+
async function defaultCodexRunner(options) {
|
|
2207
|
+
return await new Promise((resolve, reject) => {
|
|
2208
|
+
const child = spawn(options.executable, options.args, {
|
|
2209
|
+
cwd: options.cwd,
|
|
2210
|
+
env: options.env,
|
|
2211
|
+
stdio: ["pipe", "pipe", "pipe"],
|
|
2212
|
+
shell: shouldShellExecute(options.executable)
|
|
2213
|
+
});
|
|
2214
|
+
let stdout = "";
|
|
2215
|
+
let stderr = "";
|
|
2216
|
+
let timedOut = false;
|
|
2217
|
+
const onAbort = () => {
|
|
2218
|
+
child.kill("SIGTERM");
|
|
2219
|
+
};
|
|
2220
|
+
if (options.signal) {
|
|
2221
|
+
if (options.signal.aborted) {
|
|
2222
|
+
onAbort();
|
|
2223
|
+
} else {
|
|
2224
|
+
options.signal.addEventListener("abort", onAbort, { once: true });
|
|
2225
|
+
}
|
|
1193
2226
|
}
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
2227
|
+
let timeoutHandle;
|
|
2228
|
+
if (options.timeoutMs && options.timeoutMs > 0) {
|
|
2229
|
+
timeoutHandle = setTimeout(() => {
|
|
2230
|
+
timedOut = true;
|
|
2231
|
+
child.kill("SIGTERM");
|
|
2232
|
+
}, options.timeoutMs);
|
|
2233
|
+
timeoutHandle.unref?.();
|
|
2234
|
+
}
|
|
2235
|
+
child.stdout.setEncoding("utf8");
|
|
2236
|
+
child.stdout.on("data", (chunk) => {
|
|
2237
|
+
stdout += chunk;
|
|
1198
2238
|
});
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
2239
|
+
child.stderr.setEncoding("utf8");
|
|
2240
|
+
child.stderr.on("data", (chunk) => {
|
|
2241
|
+
stderr += chunk;
|
|
2242
|
+
});
|
|
2243
|
+
child.stdin.end(options.prompt);
|
|
2244
|
+
const cleanup = () => {
|
|
2245
|
+
if (timeoutHandle) {
|
|
2246
|
+
clearTimeout(timeoutHandle);
|
|
1202
2247
|
}
|
|
1203
|
-
if (
|
|
1204
|
-
|
|
2248
|
+
if (options.signal) {
|
|
2249
|
+
options.signal.removeEventListener("abort", onAbort);
|
|
1205
2250
|
}
|
|
1206
|
-
console.log(`
|
|
1207
|
-
total unlocked subagents available: ${result.created.length + result.skippedExisting.length}`);
|
|
1208
|
-
}
|
|
1209
|
-
return {
|
|
1210
|
-
provisioned: true,
|
|
1211
|
-
message: `Provisioned ${count} subagent(s): ${result.created.length} created, ${result.skippedExisting.length} reused`
|
|
1212
|
-
};
|
|
1213
|
-
} catch (error) {
|
|
1214
|
-
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
1215
|
-
if (verbose) {
|
|
1216
|
-
console.warn(`Provisioning failed (continuing anyway): ${errorMessage}`);
|
|
1217
|
-
}
|
|
1218
|
-
return {
|
|
1219
|
-
provisioned: false,
|
|
1220
|
-
message: `Provisioning failed: ${errorMessage}`
|
|
1221
2251
|
};
|
|
2252
|
+
child.on("error", (error) => {
|
|
2253
|
+
cleanup();
|
|
2254
|
+
reject(error);
|
|
2255
|
+
});
|
|
2256
|
+
child.on("close", (code) => {
|
|
2257
|
+
cleanup();
|
|
2258
|
+
resolve({
|
|
2259
|
+
stdout,
|
|
2260
|
+
stderr,
|
|
2261
|
+
exitCode: typeof code === "number" ? code : -1,
|
|
2262
|
+
timedOut
|
|
2263
|
+
});
|
|
2264
|
+
});
|
|
2265
|
+
});
|
|
2266
|
+
}
|
|
2267
|
+
function shouldShellExecute(executable) {
|
|
2268
|
+
if (process.platform !== "win32") {
|
|
2269
|
+
return false;
|
|
1222
2270
|
}
|
|
2271
|
+
const lower = executable.toLowerCase();
|
|
2272
|
+
return lower.endsWith(".cmd") || lower.endsWith(".bat") || lower.endsWith(".ps1");
|
|
1223
2273
|
}
|
|
1224
2274
|
|
|
1225
2275
|
// src/evaluation/providers/targets-file.ts
|
|
1226
|
-
import { constants as
|
|
1227
|
-
import { access as
|
|
1228
|
-
import
|
|
2276
|
+
import { constants as constants3 } from "node:fs";
|
|
2277
|
+
import { access as access3, readFile as readFile3 } from "node:fs/promises";
|
|
2278
|
+
import path6 from "node:path";
|
|
1229
2279
|
import { parse as parse2 } from "yaml";
|
|
1230
2280
|
function isRecord(value) {
|
|
1231
2281
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -1281,14 +2331,14 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
1281
2331
|
}
|
|
1282
2332
|
async function fileExists3(filePath) {
|
|
1283
2333
|
try {
|
|
1284
|
-
await
|
|
2334
|
+
await access3(filePath, constants3.F_OK);
|
|
1285
2335
|
return true;
|
|
1286
2336
|
} catch {
|
|
1287
2337
|
return false;
|
|
1288
2338
|
}
|
|
1289
2339
|
}
|
|
1290
2340
|
async function readTargetDefinitions(filePath) {
|
|
1291
|
-
const absolutePath =
|
|
2341
|
+
const absolutePath = path6.resolve(filePath);
|
|
1292
2342
|
if (!await fileExists3(absolutePath)) {
|
|
1293
2343
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
1294
2344
|
}
|
|
@@ -1315,6 +2365,10 @@ function createProvider(target) {
|
|
|
1315
2365
|
return new AnthropicProvider(target.name, target.config);
|
|
1316
2366
|
case "gemini":
|
|
1317
2367
|
return new GeminiProvider(target.name, target.config);
|
|
2368
|
+
case "cli":
|
|
2369
|
+
return new CliProvider(target.name, target.config);
|
|
2370
|
+
case "codex":
|
|
2371
|
+
return new CodexProvider(target.name, target.config);
|
|
1318
2372
|
case "mock":
|
|
1319
2373
|
return new MockProvider(target.name, target.config);
|
|
1320
2374
|
case "vscode":
|
|
@@ -1331,230 +2385,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
1331
2385
|
return createProvider(resolved);
|
|
1332
2386
|
}
|
|
1333
2387
|
|
|
1334
|
-
// src/evaluation/
|
|
1335
|
-
var KEY_TERM_MATCH_THRESHOLD = 0.5;
|
|
1336
|
-
var ACTION_WORDS = /* @__PURE__ */ new Set([
|
|
1337
|
-
"use",
|
|
1338
|
-
"avoid",
|
|
1339
|
-
"prefer",
|
|
1340
|
-
"replace",
|
|
1341
|
-
"consider",
|
|
1342
|
-
"ensure",
|
|
1343
|
-
"remove",
|
|
1344
|
-
"add"
|
|
1345
|
-
]);
|
|
1346
|
-
var STOP_WORDS = /* @__PURE__ */ new Set([
|
|
1347
|
-
"the",
|
|
1348
|
-
"a",
|
|
1349
|
-
"an",
|
|
1350
|
-
"and",
|
|
1351
|
-
"or",
|
|
1352
|
-
"but",
|
|
1353
|
-
"in",
|
|
1354
|
-
"on",
|
|
1355
|
-
"at",
|
|
1356
|
-
"to",
|
|
1357
|
-
"for",
|
|
1358
|
-
"of",
|
|
1359
|
-
"with",
|
|
1360
|
-
"by",
|
|
1361
|
-
"is",
|
|
1362
|
-
"are",
|
|
1363
|
-
"was",
|
|
1364
|
-
"were",
|
|
1365
|
-
"be",
|
|
1366
|
-
"been",
|
|
1367
|
-
"being",
|
|
1368
|
-
"have",
|
|
1369
|
-
"has",
|
|
1370
|
-
"had",
|
|
1371
|
-
"do",
|
|
1372
|
-
"does",
|
|
1373
|
-
"did",
|
|
1374
|
-
"will",
|
|
1375
|
-
"would",
|
|
1376
|
-
"could",
|
|
1377
|
-
"should"
|
|
1378
|
-
]);
|
|
1379
|
-
var ERROR_PREFIXES = [
|
|
1380
|
-
"error:",
|
|
1381
|
-
"err:",
|
|
1382
|
-
"vs code command failed",
|
|
1383
|
-
"exception",
|
|
1384
|
-
"traceback",
|
|
1385
|
-
"no response file was generated",
|
|
1386
|
-
"timed out",
|
|
1387
|
-
"cli not found"
|
|
1388
|
-
];
|
|
1389
|
-
function extractAspects(expectedResponse) {
|
|
1390
|
-
const lines = expectedResponse.split(/\r?\n/).map((line) => line.trim());
|
|
1391
|
-
const aspects = [];
|
|
1392
|
-
for (const line of lines) {
|
|
1393
|
-
if (line.length === 0) {
|
|
1394
|
-
continue;
|
|
1395
|
-
}
|
|
1396
|
-
const bulletMatch = /^([-*•]|[0-9]+\.)\s*(.+)$/.exec(line);
|
|
1397
|
-
if (bulletMatch) {
|
|
1398
|
-
const normalized = normalizeAspect(bulletMatch[2]);
|
|
1399
|
-
if (normalized.length > 0) {
|
|
1400
|
-
aspects.push(normalized);
|
|
1401
|
-
}
|
|
1402
|
-
continue;
|
|
1403
|
-
}
|
|
1404
|
-
const lowered = line.toLowerCase();
|
|
1405
|
-
if (Array.from(ACTION_WORDS).some((word) => lowered.startsWith(word))) {
|
|
1406
|
-
const normalized = normalizeAspect(line);
|
|
1407
|
-
if (normalized.length > 0) {
|
|
1408
|
-
aspects.push(normalized);
|
|
1409
|
-
}
|
|
1410
|
-
}
|
|
1411
|
-
}
|
|
1412
|
-
return aspects;
|
|
1413
|
-
}
|
|
1414
|
-
function calculateHits(candidateResponse, expectedAspects) {
|
|
1415
|
-
const { normalizedText, words } = normalizeCandidate(candidateResponse);
|
|
1416
|
-
const hits = [];
|
|
1417
|
-
for (const aspect of expectedAspects) {
|
|
1418
|
-
if (matchesAspect(aspect, normalizedText, words)) {
|
|
1419
|
-
hits.push(aspect);
|
|
1420
|
-
}
|
|
1421
|
-
}
|
|
1422
|
-
return hits;
|
|
1423
|
-
}
|
|
1424
|
-
function calculateMisses(candidateResponse, expectedAspects, resolvedHits) {
|
|
1425
|
-
const hits = new Set(resolvedHits ?? calculateHits(candidateResponse, expectedAspects));
|
|
1426
|
-
return expectedAspects.filter((aspect) => !hits.has(aspect));
|
|
1427
|
-
}
|
|
1428
|
-
function scoreCandidateResponse(candidateResponse, expectedAspects) {
|
|
1429
|
-
if (expectedAspects.length === 0) {
|
|
1430
|
-
if (isErrorLike(candidateResponse)) {
|
|
1431
|
-
return {
|
|
1432
|
-
score: 0,
|
|
1433
|
-
hits: [],
|
|
1434
|
-
misses: ["Model produced an error instead of an answer."],
|
|
1435
|
-
hitCount: 0,
|
|
1436
|
-
totalAspects: 0,
|
|
1437
|
-
rawAspects: []
|
|
1438
|
-
};
|
|
1439
|
-
}
|
|
1440
|
-
return {
|
|
1441
|
-
score: 1,
|
|
1442
|
-
hits: [],
|
|
1443
|
-
misses: [],
|
|
1444
|
-
hitCount: 0,
|
|
1445
|
-
totalAspects: 0,
|
|
1446
|
-
rawAspects: []
|
|
1447
|
-
};
|
|
1448
|
-
}
|
|
1449
|
-
const hits = calculateHits(candidateResponse, expectedAspects);
|
|
1450
|
-
const misses = expectedAspects.filter((aspect) => !hits.includes(aspect));
|
|
1451
|
-
const score = expectedAspects.length > 0 ? hits.length / expectedAspects.length : 0;
|
|
1452
|
-
return {
|
|
1453
|
-
score,
|
|
1454
|
-
hits,
|
|
1455
|
-
misses,
|
|
1456
|
-
hitCount: hits.length,
|
|
1457
|
-
totalAspects: expectedAspects.length,
|
|
1458
|
-
rawAspects: expectedAspects
|
|
1459
|
-
};
|
|
1460
|
-
}
|
|
1461
|
-
function isErrorLike(text) {
|
|
1462
|
-
if (!text) {
|
|
1463
|
-
return false;
|
|
1464
|
-
}
|
|
1465
|
-
const lowered = text.trim().toLowerCase();
|
|
1466
|
-
return ERROR_PREFIXES.some((prefix) => lowered.startsWith(prefix));
|
|
1467
|
-
}
|
|
1468
|
-
function normalizeAspect(aspect) {
|
|
1469
|
-
const sanitized = aspect.toLowerCase().replace(/[^\w\s]/g, " ").replace(/\s+/g, " ").trim();
|
|
1470
|
-
return sanitized;
|
|
1471
|
-
}
|
|
1472
|
-
function normalizeCandidate(candidate) {
|
|
1473
|
-
const lowered = candidate.toLowerCase();
|
|
1474
|
-
const normalizedText = lowered.replace(/[^\w\s]/g, " ");
|
|
1475
|
-
const words = new Set(normalizedText.split(/\s+/).filter((word) => word.length > 0));
|
|
1476
|
-
return { normalizedText, words };
|
|
1477
|
-
}
|
|
1478
|
-
function matchesAspect(aspect, candidateNormalized, candidateWords) {
|
|
1479
|
-
const keyTerms = extractKeyTerms(aspect);
|
|
1480
|
-
if (keyTerms.length === 0) {
|
|
1481
|
-
return false;
|
|
1482
|
-
}
|
|
1483
|
-
const matches = keyTerms.filter((term) => candidateWords.has(term)).length;
|
|
1484
|
-
const ratio = matches / keyTerms.length;
|
|
1485
|
-
if (ratio >= KEY_TERM_MATCH_THRESHOLD) {
|
|
1486
|
-
return true;
|
|
1487
|
-
}
|
|
1488
|
-
const aspectWords = aspect.split(" ");
|
|
1489
|
-
if (aspectWords.length >= 2) {
|
|
1490
|
-
for (let index = 0; index < aspectWords.length - 1; index += 1) {
|
|
1491
|
-
const phrase = `${aspectWords[index]} ${aspectWords[index + 1]}`;
|
|
1492
|
-
if (candidateNormalized.includes(phrase)) {
|
|
1493
|
-
return true;
|
|
1494
|
-
}
|
|
1495
|
-
}
|
|
1496
|
-
}
|
|
1497
|
-
return false;
|
|
1498
|
-
}
|
|
1499
|
-
function extractKeyTerms(aspect, maxTerms = 5) {
|
|
1500
|
-
const terms = [];
|
|
1501
|
-
const words = aspect.split(" ");
|
|
1502
|
-
for (const word of words) {
|
|
1503
|
-
if (word.length <= 2) {
|
|
1504
|
-
continue;
|
|
1505
|
-
}
|
|
1506
|
-
if (STOP_WORDS.has(word)) {
|
|
1507
|
-
continue;
|
|
1508
|
-
}
|
|
1509
|
-
terms.push(word);
|
|
1510
|
-
if (terms.length >= maxTerms) {
|
|
1511
|
-
break;
|
|
1512
|
-
}
|
|
1513
|
-
}
|
|
1514
|
-
return terms;
|
|
1515
|
-
}
|
|
1516
|
-
|
|
1517
|
-
// src/evaluation/grading.ts
|
|
2388
|
+
// src/evaluation/evaluators.ts
|
|
1518
2389
|
import { randomUUID } from "node:crypto";
|
|
1519
|
-
var
|
|
1520
|
-
kind = "heuristic";
|
|
1521
|
-
grade(context) {
|
|
1522
|
-
const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
|
|
1523
|
-
const result = scoreCandidateResponse(context.candidate, expectedAspects);
|
|
1524
|
-
const misses = [...result.misses];
|
|
1525
|
-
if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
|
|
1526
|
-
const firstLine = context.candidate.split(/\r?\n/)[0]?.trim();
|
|
1527
|
-
if (firstLine && !misses.includes(firstLine)) {
|
|
1528
|
-
misses.unshift(firstLine);
|
|
1529
|
-
}
|
|
1530
|
-
}
|
|
1531
|
-
return {
|
|
1532
|
-
score: result.score,
|
|
1533
|
-
hits: result.hits,
|
|
1534
|
-
misses,
|
|
1535
|
-
expectedAspectCount: result.totalAspects,
|
|
1536
|
-
rawAspects: result.rawAspects
|
|
1537
|
-
};
|
|
1538
|
-
}
|
|
1539
|
-
};
|
|
1540
|
-
var QualityGrader = class {
|
|
2390
|
+
var LlmJudgeEvaluator = class {
|
|
1541
2391
|
kind = "llm_judge";
|
|
1542
2392
|
resolveJudgeProvider;
|
|
1543
2393
|
maxOutputTokens;
|
|
1544
2394
|
temperature;
|
|
2395
|
+
customPrompt;
|
|
1545
2396
|
constructor(options) {
|
|
1546
2397
|
this.resolveJudgeProvider = options.resolveJudgeProvider;
|
|
1547
2398
|
this.maxOutputTokens = options.maxOutputTokens;
|
|
1548
2399
|
this.temperature = options.temperature;
|
|
2400
|
+
this.customPrompt = options.customPrompt;
|
|
1549
2401
|
}
|
|
1550
|
-
async
|
|
2402
|
+
async evaluate(context) {
|
|
1551
2403
|
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
1552
2404
|
if (!judgeProvider) {
|
|
1553
2405
|
throw new Error("No judge provider available for LLM grading");
|
|
1554
2406
|
}
|
|
1555
2407
|
const prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
2408
|
+
const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
|
|
1556
2409
|
const metadata = {
|
|
1557
|
-
systemPrompt:
|
|
2410
|
+
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2411
|
+
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
1558
2412
|
};
|
|
1559
2413
|
const response = await judgeProvider.invoke({
|
|
1560
2414
|
prompt,
|
|
@@ -1569,12 +2423,13 @@ var QualityGrader = class {
|
|
|
1569
2423
|
const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
1570
2424
|
const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
1571
2425
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
1572
|
-
const
|
|
2426
|
+
const evaluatorRawRequest = {
|
|
1573
2427
|
id: randomUUID(),
|
|
1574
2428
|
provider: judgeProvider.id,
|
|
1575
2429
|
prompt,
|
|
1576
|
-
|
|
1577
|
-
|
|
2430
|
+
target: context.target.name,
|
|
2431
|
+
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2432
|
+
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
1578
2433
|
};
|
|
1579
2434
|
return {
|
|
1580
2435
|
score,
|
|
@@ -1582,7 +2437,7 @@ var QualityGrader = class {
|
|
|
1582
2437
|
misses,
|
|
1583
2438
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
1584
2439
|
reasoning,
|
|
1585
|
-
|
|
2440
|
+
evaluatorRawRequest
|
|
1586
2441
|
};
|
|
1587
2442
|
}
|
|
1588
2443
|
};
|
|
@@ -1700,11 +2555,117 @@ function extractJsonBlob(text) {
|
|
|
1700
2555
|
function isNonEmptyString(value) {
|
|
1701
2556
|
return typeof value === "string" && value.trim().length > 0;
|
|
1702
2557
|
}
|
|
2558
|
+
var CodeEvaluator = class {
|
|
2559
|
+
kind = "code";
|
|
2560
|
+
script;
|
|
2561
|
+
cwd;
|
|
2562
|
+
agentTimeoutMs;
|
|
2563
|
+
constructor(options) {
|
|
2564
|
+
this.script = options.script;
|
|
2565
|
+
this.cwd = options.cwd;
|
|
2566
|
+
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
2567
|
+
}
|
|
2568
|
+
async evaluate(context) {
|
|
2569
|
+
const inputPayload = JSON.stringify(
|
|
2570
|
+
{
|
|
2571
|
+
task: context.evalCase.task,
|
|
2572
|
+
outcome: context.evalCase.outcome,
|
|
2573
|
+
expected: context.evalCase.expected_assistant_raw,
|
|
2574
|
+
output: context.candidate,
|
|
2575
|
+
system_message: context.promptInputs.systemMessage ?? "",
|
|
2576
|
+
guideline_paths: context.evalCase.guideline_paths,
|
|
2577
|
+
attachments: context.evalCase.file_paths,
|
|
2578
|
+
user_segments: context.evalCase.user_segments
|
|
2579
|
+
},
|
|
2580
|
+
null,
|
|
2581
|
+
2
|
|
2582
|
+
);
|
|
2583
|
+
try {
|
|
2584
|
+
const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
|
|
2585
|
+
const parsed = parseJsonSafe(stdout);
|
|
2586
|
+
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
2587
|
+
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
2588
|
+
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
2589
|
+
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
2590
|
+
return {
|
|
2591
|
+
score,
|
|
2592
|
+
hits,
|
|
2593
|
+
misses,
|
|
2594
|
+
expectedAspectCount: hits.length + misses.length || 1,
|
|
2595
|
+
reasoning,
|
|
2596
|
+
evaluatorRawRequest: {
|
|
2597
|
+
script: this.script,
|
|
2598
|
+
...this.cwd ? { cwd: this.cwd } : {}
|
|
2599
|
+
}
|
|
2600
|
+
};
|
|
2601
|
+
} catch (error) {
|
|
2602
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
2603
|
+
return {
|
|
2604
|
+
score: 0,
|
|
2605
|
+
hits: [],
|
|
2606
|
+
misses: [`Code evaluator failed: ${message}`],
|
|
2607
|
+
expectedAspectCount: 1,
|
|
2608
|
+
reasoning: message,
|
|
2609
|
+
evaluatorRawRequest: {
|
|
2610
|
+
script: this.script,
|
|
2611
|
+
...this.cwd ? { cwd: this.cwd } : {},
|
|
2612
|
+
error: message
|
|
2613
|
+
}
|
|
2614
|
+
};
|
|
2615
|
+
}
|
|
2616
|
+
}
|
|
2617
|
+
};
|
|
2618
|
+
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
|
|
2619
|
+
const { spawn: spawn2 } = await import("node:child_process");
|
|
2620
|
+
return await new Promise((resolve, reject) => {
|
|
2621
|
+
const child = spawn2(scriptPath, {
|
|
2622
|
+
shell: true,
|
|
2623
|
+
cwd
|
|
2624
|
+
});
|
|
2625
|
+
let stdout = "";
|
|
2626
|
+
let stderr = "";
|
|
2627
|
+
const timeout = agentTimeoutMs ? setTimeout(() => {
|
|
2628
|
+
child.kill();
|
|
2629
|
+
reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
|
|
2630
|
+
}, agentTimeoutMs) : void 0;
|
|
2631
|
+
child.stdout?.on("data", (data) => {
|
|
2632
|
+
stdout += data.toString();
|
|
2633
|
+
});
|
|
2634
|
+
child.stderr?.on("data", (data) => {
|
|
2635
|
+
stderr += data.toString();
|
|
2636
|
+
});
|
|
2637
|
+
child.on("error", (error) => {
|
|
2638
|
+
if (timeout !== void 0) {
|
|
2639
|
+
clearTimeout(timeout);
|
|
2640
|
+
}
|
|
2641
|
+
reject(error);
|
|
2642
|
+
});
|
|
2643
|
+
child.on("exit", (code) => {
|
|
2644
|
+
if (timeout !== void 0) {
|
|
2645
|
+
clearTimeout(timeout);
|
|
2646
|
+
}
|
|
2647
|
+
if (code && code !== 0 && stderr.length > 0) {
|
|
2648
|
+
reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
|
|
2649
|
+
return;
|
|
2650
|
+
}
|
|
2651
|
+
resolve(stdout.trim());
|
|
2652
|
+
});
|
|
2653
|
+
child.stdin?.write(input);
|
|
2654
|
+
child.stdin?.end();
|
|
2655
|
+
});
|
|
2656
|
+
}
|
|
2657
|
+
function parseJsonSafe(payload) {
|
|
2658
|
+
try {
|
|
2659
|
+
return JSON.parse(payload);
|
|
2660
|
+
} catch {
|
|
2661
|
+
return void 0;
|
|
2662
|
+
}
|
|
2663
|
+
}
|
|
1703
2664
|
|
|
1704
2665
|
// src/evaluation/orchestrator.ts
|
|
1705
2666
|
import { createHash, randomUUID as randomUUID2 } from "node:crypto";
|
|
1706
|
-
import { mkdir, writeFile as writeFile2 } from "node:fs/promises";
|
|
1707
|
-
import
|
|
2667
|
+
import { mkdir as mkdir2, readFile as readFile4, writeFile as writeFile2 } from "node:fs/promises";
|
|
2668
|
+
import path7 from "node:path";
|
|
1708
2669
|
|
|
1709
2670
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
1710
2671
|
var Node = class {
|
|
@@ -1851,7 +2812,7 @@ async function runEvaluation(options) {
|
|
|
1851
2812
|
targets,
|
|
1852
2813
|
env,
|
|
1853
2814
|
providerFactory,
|
|
1854
|
-
|
|
2815
|
+
evaluators,
|
|
1855
2816
|
maxRetries,
|
|
1856
2817
|
agentTimeoutMs,
|
|
1857
2818
|
promptDumpDir,
|
|
@@ -1910,7 +2871,7 @@ async function runEvaluation(options) {
|
|
|
1910
2871
|
}
|
|
1911
2872
|
return getOrCreateProvider(resolvedJudge);
|
|
1912
2873
|
};
|
|
1913
|
-
const
|
|
2874
|
+
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
1914
2875
|
const primaryProvider = getOrCreateProvider(target);
|
|
1915
2876
|
const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
|
|
1916
2877
|
if (target.providerBatching && !providerSupportsBatch && verbose) {
|
|
@@ -1933,13 +2894,14 @@ async function runEvaluation(options) {
|
|
|
1933
2894
|
evalCases: filteredEvalCases,
|
|
1934
2895
|
provider: primaryProvider,
|
|
1935
2896
|
target,
|
|
1936
|
-
|
|
2897
|
+
evaluatorRegistry,
|
|
1937
2898
|
promptDumpDir,
|
|
1938
2899
|
nowFn: now ?? (() => /* @__PURE__ */ new Date()),
|
|
1939
2900
|
onProgress,
|
|
1940
2901
|
onResult,
|
|
1941
2902
|
verbose,
|
|
1942
|
-
resolveJudgeProvider
|
|
2903
|
+
resolveJudgeProvider,
|
|
2904
|
+
agentTimeoutMs
|
|
1943
2905
|
});
|
|
1944
2906
|
} catch (error) {
|
|
1945
2907
|
if (verbose) {
|
|
@@ -1970,7 +2932,7 @@ async function runEvaluation(options) {
|
|
|
1970
2932
|
evalCase,
|
|
1971
2933
|
provider: primaryProvider,
|
|
1972
2934
|
target,
|
|
1973
|
-
|
|
2935
|
+
evaluators: evaluatorRegistry,
|
|
1974
2936
|
maxRetries,
|
|
1975
2937
|
agentTimeoutMs,
|
|
1976
2938
|
promptDumpDir,
|
|
@@ -2036,12 +2998,13 @@ async function runBatchEvaluation(options) {
|
|
|
2036
2998
|
evalCases,
|
|
2037
2999
|
provider,
|
|
2038
3000
|
target,
|
|
2039
|
-
|
|
3001
|
+
evaluatorRegistry,
|
|
2040
3002
|
promptDumpDir,
|
|
2041
3003
|
nowFn,
|
|
2042
3004
|
onProgress,
|
|
2043
3005
|
onResult,
|
|
2044
|
-
resolveJudgeProvider
|
|
3006
|
+
resolveJudgeProvider,
|
|
3007
|
+
agentTimeoutMs
|
|
2045
3008
|
} = options;
|
|
2046
3009
|
const promptInputsList = [];
|
|
2047
3010
|
for (const evalCase of evalCases) {
|
|
@@ -2057,7 +3020,7 @@ async function runBatchEvaluation(options) {
|
|
|
2057
3020
|
prompt: promptInputs.request,
|
|
2058
3021
|
guidelines: promptInputs.guidelines,
|
|
2059
3022
|
guideline_patterns: evalCase.guideline_patterns,
|
|
2060
|
-
|
|
3023
|
+
inputFiles: evalCase.file_paths,
|
|
2061
3024
|
evalCaseId: evalCase.id,
|
|
2062
3025
|
metadata: {
|
|
2063
3026
|
systemPrompt: promptInputs.systemMessage ?? ""
|
|
@@ -2089,23 +3052,19 @@ async function runBatchEvaluation(options) {
|
|
|
2089
3052
|
const evalCase = evalCases[i];
|
|
2090
3053
|
const promptInputs = promptInputsList[i];
|
|
2091
3054
|
const providerResponse = batchResponse[i];
|
|
2092
|
-
|
|
2093
|
-
const graderKind = evalCase.grader ?? "heuristic";
|
|
2094
|
-
const activeGrader = graderRegistry[graderKind] ?? graderRegistry.heuristic;
|
|
2095
|
-
if (!activeGrader) {
|
|
2096
|
-
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
2097
|
-
}
|
|
2098
|
-
let grade;
|
|
3055
|
+
let result;
|
|
2099
3056
|
try {
|
|
2100
|
-
|
|
3057
|
+
result = await evaluateCandidate({
|
|
2101
3058
|
evalCase,
|
|
2102
3059
|
candidate: providerResponse.text ?? "",
|
|
2103
3060
|
target,
|
|
2104
3061
|
provider,
|
|
2105
|
-
|
|
3062
|
+
evaluators: evaluatorRegistry,
|
|
2106
3063
|
promptInputs,
|
|
2107
|
-
|
|
2108
|
-
|
|
3064
|
+
nowFn,
|
|
3065
|
+
attempt: 0,
|
|
3066
|
+
judgeProvider: await resolveJudgeProvider(target),
|
|
3067
|
+
agentTimeoutMs
|
|
2109
3068
|
});
|
|
2110
3069
|
} catch (error) {
|
|
2111
3070
|
const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
@@ -2124,28 +3083,6 @@ async function runBatchEvaluation(options) {
|
|
|
2124
3083
|
}
|
|
2125
3084
|
continue;
|
|
2126
3085
|
}
|
|
2127
|
-
const completedAt = nowFn();
|
|
2128
|
-
const rawRequest = {
|
|
2129
|
-
request: promptInputs.request,
|
|
2130
|
-
guidelines: promptInputs.guidelines,
|
|
2131
|
-
guideline_paths: evalCase.guideline_paths,
|
|
2132
|
-
system_message: promptInputs.systemMessage ?? ""
|
|
2133
|
-
};
|
|
2134
|
-
const result = {
|
|
2135
|
-
eval_id: evalCase.id,
|
|
2136
|
-
conversation_id: evalCase.conversation_id,
|
|
2137
|
-
score: grade.score,
|
|
2138
|
-
hits: grade.hits,
|
|
2139
|
-
misses: grade.misses,
|
|
2140
|
-
model_answer: providerResponse.text ?? "",
|
|
2141
|
-
expected_aspect_count: grade.expectedAspectCount,
|
|
2142
|
-
target: target.name,
|
|
2143
|
-
timestamp: completedAt.toISOString(),
|
|
2144
|
-
reasoning: grade.reasoning,
|
|
2145
|
-
raw_aspects: grade.rawAspects,
|
|
2146
|
-
raw_request: rawRequest,
|
|
2147
|
-
grader_raw_request: grade.graderRawRequest
|
|
2148
|
-
};
|
|
2149
3086
|
results.push(result);
|
|
2150
3087
|
if (onResult) {
|
|
2151
3088
|
await onResult(result);
|
|
@@ -2167,7 +3104,7 @@ async function runEvalCase(options) {
|
|
|
2167
3104
|
evalCase,
|
|
2168
3105
|
provider,
|
|
2169
3106
|
target,
|
|
2170
|
-
|
|
3107
|
+
evaluators,
|
|
2171
3108
|
now,
|
|
2172
3109
|
maxRetries,
|
|
2173
3110
|
agentTimeoutMs,
|
|
@@ -2222,27 +3159,49 @@ async function runEvalCase(options) {
|
|
|
2222
3159
|
if (cacheKey && cache && !cachedResponse) {
|
|
2223
3160
|
await cache.set(cacheKey, providerResponse);
|
|
2224
3161
|
}
|
|
2225
|
-
const graderKind = evalCase.grader ?? "heuristic";
|
|
2226
|
-
const activeGrader = graders[graderKind] ?? graders.heuristic;
|
|
2227
|
-
if (!activeGrader) {
|
|
2228
|
-
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
2229
|
-
}
|
|
2230
|
-
let grade;
|
|
2231
3162
|
try {
|
|
2232
|
-
|
|
2233
|
-
grade = await activeGrader.grade({
|
|
3163
|
+
return await evaluateCandidate({
|
|
2234
3164
|
evalCase,
|
|
2235
3165
|
candidate: providerResponse.text ?? "",
|
|
2236
3166
|
target,
|
|
2237
3167
|
provider,
|
|
2238
|
-
|
|
3168
|
+
evaluators,
|
|
2239
3169
|
promptInputs,
|
|
2240
|
-
|
|
2241
|
-
|
|
3170
|
+
nowFn,
|
|
3171
|
+
attempt,
|
|
3172
|
+
judgeProvider,
|
|
3173
|
+
agentTimeoutMs
|
|
2242
3174
|
});
|
|
2243
3175
|
} catch (error) {
|
|
2244
3176
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
2245
3177
|
}
|
|
3178
|
+
}
|
|
3179
|
+
async function evaluateCandidate(options) {
|
|
3180
|
+
const {
|
|
3181
|
+
evalCase,
|
|
3182
|
+
candidate,
|
|
3183
|
+
target,
|
|
3184
|
+
provider,
|
|
3185
|
+
evaluators,
|
|
3186
|
+
promptInputs,
|
|
3187
|
+
nowFn,
|
|
3188
|
+
attempt,
|
|
3189
|
+
judgeProvider,
|
|
3190
|
+
agentTimeoutMs
|
|
3191
|
+
} = options;
|
|
3192
|
+
const gradeTimestamp = nowFn();
|
|
3193
|
+
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
3194
|
+
evalCase,
|
|
3195
|
+
candidate,
|
|
3196
|
+
target,
|
|
3197
|
+
provider,
|
|
3198
|
+
evaluators,
|
|
3199
|
+
attempt,
|
|
3200
|
+
promptInputs,
|
|
3201
|
+
now: gradeTimestamp,
|
|
3202
|
+
judgeProvider,
|
|
3203
|
+
agentTimeoutMs
|
|
3204
|
+
});
|
|
2246
3205
|
const completedAt = nowFn();
|
|
2247
3206
|
const rawRequest = {
|
|
2248
3207
|
request: promptInputs.request,
|
|
@@ -2253,28 +3212,200 @@ async function runEvalCase(options) {
|
|
|
2253
3212
|
return {
|
|
2254
3213
|
eval_id: evalCase.id,
|
|
2255
3214
|
conversation_id: evalCase.conversation_id,
|
|
2256
|
-
score:
|
|
2257
|
-
hits:
|
|
2258
|
-
misses:
|
|
2259
|
-
model_answer:
|
|
2260
|
-
expected_aspect_count:
|
|
3215
|
+
score: score.score,
|
|
3216
|
+
hits: score.hits,
|
|
3217
|
+
misses: score.misses,
|
|
3218
|
+
model_answer: candidate,
|
|
3219
|
+
expected_aspect_count: score.expectedAspectCount,
|
|
2261
3220
|
target: target.name,
|
|
2262
3221
|
timestamp: completedAt.toISOString(),
|
|
2263
|
-
reasoning:
|
|
2264
|
-
raw_aspects:
|
|
3222
|
+
reasoning: score.reasoning,
|
|
3223
|
+
raw_aspects: score.rawAspects,
|
|
2265
3224
|
raw_request: rawRequest,
|
|
2266
|
-
|
|
3225
|
+
evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
3226
|
+
evaluator_results: evaluatorResults
|
|
2267
3227
|
};
|
|
2268
3228
|
}
|
|
3229
|
+
async function runEvaluatorsForCase(options) {
|
|
3230
|
+
const { evalCase, candidate, target, provider, evaluators, attempt, promptInputs, now, judgeProvider, agentTimeoutMs } = options;
|
|
3231
|
+
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
3232
|
+
return runEvaluatorList({
|
|
3233
|
+
evalCase,
|
|
3234
|
+
evaluators: evalCase.evaluators,
|
|
3235
|
+
candidate,
|
|
3236
|
+
target,
|
|
3237
|
+
provider,
|
|
3238
|
+
evaluatorRegistry: evaluators,
|
|
3239
|
+
attempt,
|
|
3240
|
+
promptInputs,
|
|
3241
|
+
now,
|
|
3242
|
+
judgeProvider,
|
|
3243
|
+
agentTimeoutMs
|
|
3244
|
+
});
|
|
3245
|
+
}
|
|
3246
|
+
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
3247
|
+
const activeEvaluator = evaluators[evaluatorKind] ?? evaluators.llm_judge;
|
|
3248
|
+
if (!activeEvaluator) {
|
|
3249
|
+
throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
|
|
3250
|
+
}
|
|
3251
|
+
const score = await activeEvaluator.evaluate({
|
|
3252
|
+
evalCase,
|
|
3253
|
+
candidate,
|
|
3254
|
+
target,
|
|
3255
|
+
provider,
|
|
3256
|
+
attempt,
|
|
3257
|
+
promptInputs,
|
|
3258
|
+
now,
|
|
3259
|
+
judgeProvider
|
|
3260
|
+
});
|
|
3261
|
+
return { score };
|
|
3262
|
+
}
|
|
3263
|
+
async function runEvaluatorList(options) {
|
|
3264
|
+
const {
|
|
3265
|
+
evalCase,
|
|
3266
|
+
evaluators,
|
|
3267
|
+
candidate,
|
|
3268
|
+
target,
|
|
3269
|
+
provider,
|
|
3270
|
+
evaluatorRegistry,
|
|
3271
|
+
attempt,
|
|
3272
|
+
promptInputs,
|
|
3273
|
+
now,
|
|
3274
|
+
judgeProvider,
|
|
3275
|
+
agentTimeoutMs
|
|
3276
|
+
} = options;
|
|
3277
|
+
const scored = [];
|
|
3278
|
+
const evaluatorResults = [];
|
|
3279
|
+
for (const evaluator of evaluators ?? []) {
|
|
3280
|
+
try {
|
|
3281
|
+
if (evaluator.type === "llm_judge") {
|
|
3282
|
+
const score2 = await runLlmJudgeEvaluator({
|
|
3283
|
+
config: evaluator,
|
|
3284
|
+
evalCase,
|
|
3285
|
+
candidate,
|
|
3286
|
+
target,
|
|
3287
|
+
provider,
|
|
3288
|
+
evaluatorRegistry,
|
|
3289
|
+
attempt,
|
|
3290
|
+
promptInputs,
|
|
3291
|
+
now,
|
|
3292
|
+
judgeProvider
|
|
3293
|
+
});
|
|
3294
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
3295
|
+
evaluatorResults.push({
|
|
3296
|
+
name: evaluator.name,
|
|
3297
|
+
type: evaluator.type,
|
|
3298
|
+
score: score2.score,
|
|
3299
|
+
hits: score2.hits,
|
|
3300
|
+
misses: score2.misses,
|
|
3301
|
+
reasoning: score2.reasoning,
|
|
3302
|
+
evaluator_raw_request: score2.evaluatorRawRequest
|
|
3303
|
+
});
|
|
3304
|
+
continue;
|
|
3305
|
+
}
|
|
3306
|
+
if (evaluator.type === "code") {
|
|
3307
|
+
const codeEvaluator = new CodeEvaluator({
|
|
3308
|
+
script: evaluator.script,
|
|
3309
|
+
cwd: evaluator.resolvedCwd ?? evaluator.cwd,
|
|
3310
|
+
agentTimeoutMs
|
|
3311
|
+
});
|
|
3312
|
+
const score2 = await codeEvaluator.evaluate({
|
|
3313
|
+
evalCase,
|
|
3314
|
+
candidate,
|
|
3315
|
+
target,
|
|
3316
|
+
provider,
|
|
3317
|
+
attempt,
|
|
3318
|
+
promptInputs,
|
|
3319
|
+
now
|
|
3320
|
+
});
|
|
3321
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
3322
|
+
evaluatorResults.push({
|
|
3323
|
+
name: evaluator.name,
|
|
3324
|
+
type: evaluator.type,
|
|
3325
|
+
score: score2.score,
|
|
3326
|
+
hits: score2.hits,
|
|
3327
|
+
misses: score2.misses,
|
|
3328
|
+
reasoning: score2.reasoning,
|
|
3329
|
+
evaluator_raw_request: score2.evaluatorRawRequest
|
|
3330
|
+
});
|
|
3331
|
+
continue;
|
|
3332
|
+
}
|
|
3333
|
+
} catch (error) {
|
|
3334
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3335
|
+
const fallbackScore = {
|
|
3336
|
+
score: 0,
|
|
3337
|
+
hits: [],
|
|
3338
|
+
misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
|
|
3339
|
+
expectedAspectCount: 1,
|
|
3340
|
+
reasoning: message
|
|
3341
|
+
};
|
|
3342
|
+
scored.push({ score: fallbackScore, name: evaluator.name ?? "unknown", type: evaluator.type ?? "unknown" });
|
|
3343
|
+
evaluatorResults.push({
|
|
3344
|
+
name: evaluator.name ?? "unknown",
|
|
3345
|
+
type: evaluator.type ?? "unknown",
|
|
3346
|
+
score: 0,
|
|
3347
|
+
hits: [],
|
|
3348
|
+
misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
|
|
3349
|
+
reasoning: message
|
|
3350
|
+
});
|
|
3351
|
+
}
|
|
3352
|
+
}
|
|
3353
|
+
const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
|
|
3354
|
+
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
3355
|
+
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
3356
|
+
const expectedAspectCount = scored.reduce((total, entry) => total + (entry.score.expectedAspectCount ?? 0), 0);
|
|
3357
|
+
const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
|
|
3358
|
+
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
|
|
3359
|
+
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
3360
|
+
const score = {
|
|
3361
|
+
score: aggregateScore,
|
|
3362
|
+
hits,
|
|
3363
|
+
misses,
|
|
3364
|
+
expectedAspectCount,
|
|
3365
|
+
reasoning,
|
|
3366
|
+
rawAspects: rawAspects.length > 0 ? rawAspects : void 0
|
|
3367
|
+
};
|
|
3368
|
+
return { score, evaluatorResults };
|
|
3369
|
+
}
|
|
3370
|
+
async function runLlmJudgeEvaluator(options) {
|
|
3371
|
+
const { config, evalCase, candidate, target, provider, evaluatorRegistry, attempt, promptInputs, now, judgeProvider } = options;
|
|
3372
|
+
const customPrompt = await resolveCustomPrompt(config);
|
|
3373
|
+
return evaluatorRegistry.llm_judge.evaluate({
|
|
3374
|
+
evalCase,
|
|
3375
|
+
candidate,
|
|
3376
|
+
target,
|
|
3377
|
+
provider,
|
|
3378
|
+
attempt,
|
|
3379
|
+
promptInputs,
|
|
3380
|
+
now,
|
|
3381
|
+
judgeProvider,
|
|
3382
|
+
systemPrompt: customPrompt,
|
|
3383
|
+
evaluator: config,
|
|
3384
|
+
judgeModel: config.model
|
|
3385
|
+
});
|
|
3386
|
+
}
|
|
3387
|
+
async function resolveCustomPrompt(config) {
|
|
3388
|
+
if (config.promptPath) {
|
|
3389
|
+
try {
|
|
3390
|
+
return await readFile4(config.promptPath, "utf8");
|
|
3391
|
+
} catch (error) {
|
|
3392
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3393
|
+
console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
|
|
3394
|
+
}
|
|
3395
|
+
}
|
|
3396
|
+
return config.prompt;
|
|
3397
|
+
}
|
|
3398
|
+
function isNonEmptyString2(value) {
|
|
3399
|
+
return typeof value === "string" && value.trim().length > 0;
|
|
3400
|
+
}
|
|
2269
3401
|
function filterEvalCases(evalCases, evalId) {
|
|
2270
3402
|
if (!evalId) {
|
|
2271
3403
|
return evalCases;
|
|
2272
3404
|
}
|
|
2273
3405
|
return evalCases.filter((evalCase) => evalCase.id === evalId);
|
|
2274
3406
|
}
|
|
2275
|
-
function
|
|
2276
|
-
const
|
|
2277
|
-
const llmJudge = overrides?.llm_judge ?? new QualityGrader({
|
|
3407
|
+
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
3408
|
+
const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
|
|
2278
3409
|
resolveJudgeProvider: async (context) => {
|
|
2279
3410
|
if (context.judgeProvider) {
|
|
2280
3411
|
return context.judgeProvider;
|
|
@@ -2284,15 +3415,14 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
|
|
|
2284
3415
|
});
|
|
2285
3416
|
return {
|
|
2286
3417
|
...overrides,
|
|
2287
|
-
heuristic,
|
|
2288
3418
|
llm_judge: llmJudge
|
|
2289
3419
|
};
|
|
2290
3420
|
}
|
|
2291
3421
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
2292
3422
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
2293
3423
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
2294
|
-
const filePath =
|
|
2295
|
-
await
|
|
3424
|
+
const filePath = path7.resolve(directory, filename);
|
|
3425
|
+
await mkdir2(path7.dirname(filePath), { recursive: true });
|
|
2296
3426
|
const payload = {
|
|
2297
3427
|
eval_id: evalCase.id,
|
|
2298
3428
|
request: promptInputs.request,
|
|
@@ -2309,7 +3439,7 @@ function sanitizeFilename(value) {
|
|
|
2309
3439
|
return sanitized.length > 0 ? sanitized : randomUUID2();
|
|
2310
3440
|
}
|
|
2311
3441
|
async function invokeProvider(provider, options) {
|
|
2312
|
-
const { evalCase,
|
|
3442
|
+
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
2313
3443
|
const controller = new AbortController();
|
|
2314
3444
|
const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
|
|
2315
3445
|
if (signal) {
|
|
@@ -2320,7 +3450,7 @@ async function invokeProvider(provider, options) {
|
|
|
2320
3450
|
prompt: promptInputs.request,
|
|
2321
3451
|
guidelines: promptInputs.guidelines,
|
|
2322
3452
|
guideline_patterns: evalCase.guideline_patterns,
|
|
2323
|
-
|
|
3453
|
+
inputFiles: evalCase.file_paths,
|
|
2324
3454
|
evalCaseId: evalCase.id,
|
|
2325
3455
|
attempt,
|
|
2326
3456
|
metadata: {
|
|
@@ -2388,25 +3518,20 @@ function createAgentKernel() {
|
|
|
2388
3518
|
return { status: "stub" };
|
|
2389
3519
|
}
|
|
2390
3520
|
export {
|
|
2391
|
-
|
|
2392
|
-
|
|
2393
|
-
QualityGrader,
|
|
3521
|
+
CodeEvaluator,
|
|
3522
|
+
LlmJudgeEvaluator,
|
|
2394
3523
|
TEST_MESSAGE_ROLES,
|
|
2395
3524
|
buildDirectoryChain,
|
|
2396
3525
|
buildPromptInputs,
|
|
2397
3526
|
buildSearchRoots,
|
|
2398
|
-
calculateHits,
|
|
2399
|
-
calculateMisses,
|
|
2400
3527
|
createAgentKernel,
|
|
2401
3528
|
createProvider,
|
|
2402
3529
|
ensureVSCodeSubagents,
|
|
2403
|
-
extractAspects,
|
|
2404
3530
|
extractCodeBlocks,
|
|
2405
3531
|
fileExists,
|
|
2406
3532
|
findGitRoot,
|
|
2407
3533
|
getHitCount,
|
|
2408
|
-
|
|
2409
|
-
isGraderKind,
|
|
3534
|
+
isEvaluatorKind,
|
|
2410
3535
|
isGuidelineFile,
|
|
2411
3536
|
isJsonObject,
|
|
2412
3537
|
isJsonValue,
|
|
@@ -2419,7 +3544,6 @@ export {
|
|
|
2419
3544
|
resolveFileReference,
|
|
2420
3545
|
resolveTargetDefinition,
|
|
2421
3546
|
runEvalCase,
|
|
2422
|
-
runEvaluation
|
|
2423
|
-
scoreCandidateResponse
|
|
3547
|
+
runEvaluation
|
|
2424
3548
|
};
|
|
2425
3549
|
//# sourceMappingURL=index.js.map
|