@agentv/core 0.2.8 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-XXNQA4EW.js → chunk-NL7K4CAK.js} +5 -1
- package/dist/chunk-NL7K4CAK.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +186 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +183 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1701 -324
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +121 -63
- package/dist/index.d.ts +121 -63
- package/dist/index.js +1710 -327
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-XXNQA4EW.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -5,7 +5,7 @@ import {
|
|
|
5
5
|
fileExists,
|
|
6
6
|
findGitRoot,
|
|
7
7
|
resolveFileReference
|
|
8
|
-
} from "./chunk-
|
|
8
|
+
} from "./chunk-NL7K4CAK.js";
|
|
9
9
|
|
|
10
10
|
// src/evaluation/types.ts
|
|
11
11
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -48,11 +48,10 @@ function isTestMessage(value) {
|
|
|
48
48
|
}
|
|
49
49
|
return candidate.content.every(isJsonObject);
|
|
50
50
|
}
|
|
51
|
-
var
|
|
52
|
-
var
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
return typeof value === "string" && GRADER_KIND_SET.has(value);
|
|
51
|
+
// Evaluator kinds that an eval suite or eval case may name.
var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);

/**
 * Type guard for evaluator kinds.
 *
 * @param value - Any value.
 * @returns `true` only when `value` is a string listed in EVALUATOR_KIND_VALUES.
 */
function isEvaluatorKind(value) {
  if (typeof value !== "string") {
    return false;
  }
  return EVALUATOR_KIND_SET.has(value);
}
|
|
57
56
|
function getHitCount(result) {
|
|
58
57
|
return result.hits.length;
|
|
@@ -160,7 +159,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
160
159
|
if (!Array.isArray(rawTestcases)) {
|
|
161
160
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
162
161
|
}
|
|
163
|
-
const
|
|
162
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
164
163
|
const results = [];
|
|
165
164
|
for (const rawEvalcase of rawTestcases) {
|
|
166
165
|
if (!isJsonObject(rawEvalcase)) {
|
|
@@ -283,7 +282,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
283
282
|
const assistantContent = assistantMessages[0]?.content;
|
|
284
283
|
const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
285
284
|
const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
286
|
-
const
|
|
285
|
+
const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
286
|
+
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
287
287
|
const userFilePaths = [];
|
|
288
288
|
for (const segment of userSegments) {
|
|
289
289
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -306,7 +306,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
306
306
|
file_paths: allFilePaths,
|
|
307
307
|
code_snippets: codeSnippets,
|
|
308
308
|
outcome,
|
|
309
|
-
|
|
309
|
+
evaluator: testCaseEvaluatorKind,
|
|
310
|
+
evaluators
|
|
310
311
|
};
|
|
311
312
|
if (verbose) {
|
|
312
313
|
console.log(`
|
|
@@ -467,14 +468,88 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
|
467
468
|
}
|
|
468
469
|
return parts.join(" ");
|
|
469
470
|
}
|
|
470
|
-
function
|
|
471
|
+
/**
 * Parses the optional evaluator list declared on an eval case.
 *
 * `execution.evaluators` wins over the top-level `evaluators` field when
 * `execution` is an object. Malformed entries are skipped with a warning
 * rather than failing the whole case.
 *
 * @param rawEvalCase - Raw eval case object.
 * @param searchRoots - Roots handed to `resolveFileReference` for `cwd` / `prompt` lookups.
 * @param evalId - Identifier used in warning messages.
 * @returns The parsed evaluator configs, or `undefined` when none are declared or usable.
 */
async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
  const { execution } = rawEvalCase;
  const nestedEvaluators = isJsonObject(execution) ? execution.evaluators : void 0;
  const declared = nestedEvaluators ?? rawEvalCase.evaluators;
  if (declared === void 0) {
    return void 0;
  }
  if (!Array.isArray(declared)) {
    logWarning(`Skipping evaluators for '${evalId}': expected array`);
    return void 0;
  }
  // Detail lines listing every location that was probed for a missing file.
  const listAttempts = (resolution) => resolution.attempted.length > 0 ? resolution.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0;
  const parsed = [];
  for (const entry of declared) {
    if (!isJsonObject(entry)) {
      logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
      continue;
    }
    const name = asString(entry.name);
    const kind = entry.type;
    if (!name || !isEvaluatorKind(kind)) {
      logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
      continue;
    }
    if (kind === "code") {
      const script = asString(entry.script);
      if (!script) {
        logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
        continue;
      }
      const cwd = asString(entry.cwd);
      let resolvedCwd;
      if (cwd) {
        const resolution = await resolveFileReference(cwd, searchRoots);
        if (resolution.resolvedPath) {
          resolvedCwd = path.resolve(resolution.resolvedPath);
        } else {
          // The evaluator is still kept; only resolvedCwd stays unset.
          logWarning(
            `Code evaluator '${name}' in '${evalId}': cwd not found (${resolution.displayPath})`,
            listAttempts(resolution)
          );
        }
      }
      parsed.push({ name, type: "code", script, cwd, resolvedCwd });
    } else {
      // Remaining kind is "llm_judge": `prompt` is first tried as a file reference.
      const prompt = asString(entry.prompt);
      let promptPath;
      if (prompt) {
        const resolution = await resolveFileReference(prompt, searchRoots);
        if (resolution.resolvedPath) {
          promptPath = path.resolve(resolution.resolvedPath);
        } else {
          logWarning(
            `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolution.displayPath})`,
            listAttempts(resolution)
          );
        }
      }
      const model = asString(entry.model);
      parsed.push({ name, type: "llm_judge", prompt, promptPath, model });
    }
  }
  return parsed.length === 0 ? void 0 : parsed;
}
|
|
545
|
+
/**
 * Narrows a raw `evaluator` field to a known evaluator kind.
 *
 * @param candidate - Raw value read from the suite or eval case.
 * @param contextId - Label used in the warning for unknown kinds.
 * @returns The kind when recognised; `undefined` for non-strings and for
 *   unknown strings (the latter also logs a warning).
 */
function coerceEvaluator(candidate, contextId) {
  const isText = typeof candidate === "string";
  if (isText && isEvaluatorKind(candidate)) {
    return candidate;
  }
  if (isText) {
    logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
  }
  return void 0;
}
|
|
480
555
|
function logWarning(message, details) {
|
|
@@ -670,6 +745,214 @@ var GeminiProvider = class {
|
|
|
670
745
|
}
|
|
671
746
|
};
|
|
672
747
|
|
|
748
|
+
// src/evaluation/providers/cli.ts
|
|
749
|
+
import { exec as execWithCallback } from "node:child_process";
|
|
750
|
+
import path2 from "node:path";
|
|
751
|
+
import { promisify } from "node:util";
|
|
752
|
+
var execAsync = promisify(execWithCallback);
// Upper bound (10 MiB) on captured stdout/stderr per command.
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;

/**
 * Runs a shell command and reports the outcome as a plain result object
 * instead of throwing.
 *
 * @param command - Fully rendered command line.
 * @param options - `{ cwd, env, timeoutMs, signal }` forwarded to `exec`.
 * @returns `{ stdout, stderr, exitCode, failed, timedOut, signal }`; `exitCode`
 *   is `null` when the failure carries no numeric exit code.
 */
async function defaultCommandRunner(command, options) {
  const { cwd, env, timeoutMs, signal } = options;
  // On Windows the command is run through PowerShell; elsewhere exec's default shell is used.
  const shell = process.platform === "win32" ? "powershell.exe" : void 0;
  let failure;
  try {
    const output = await execAsync(command, {
      cwd,
      env,
      timeout: timeoutMs,
      signal,
      maxBuffer: DEFAULT_MAX_BUFFER,
      shell
    });
    return {
      stdout: output.stdout,
      stderr: output.stderr,
      exitCode: 0,
      failed: false,
      timedOut: false,
      signal: null
    };
  } catch (error) {
    failure = error;
  }
  const hasNumericCode = typeof failure.code === "number";
  const wasKilled = failure.timedOut === true || failure.killed === true;
  return {
    stdout: failure.stdout ?? "",
    stderr: failure.stderr ?? "",
    exitCode: hasNumericCode ? failure.code : null,
    failed: true,
    timedOut: wasKilled,
    signal: failure.signal ?? null
  };
}
|
|
785
|
+
/**
 * Provider that answers a request by rendering a command template and
 * running it through a command runner; the command's stdout is the response.
 *
 * An optional healthcheck (HTTP GET or command) runs once before the first
 * invocation and its outcome is cached for later invocations.
 */
var CliProvider = class {
  id;
  kind = "cli";
  targetName;
  supportsBatch = false;
  config;
  runCommand;
  healthcheckPromise;
  /**
   * @param targetName - Target name; also used to build `id` (`cli:<name>`).
   * @param config - Resolved CLI config (commandTemplate, filesFormat, cwd, env, timeoutMs, healthcheck).
   * @param runner - Command runner; defaults to `defaultCommandRunner`.
   */
  constructor(targetName, config, runner = defaultCommandRunner) {
    this.targetName = targetName;
    this.id = `cli:${targetName}`;
    this.config = config;
    this.runCommand = runner;
  }
  /**
   * Renders the command template for `request`, runs it and returns stdout.
   *
   * @param request - Provider request (prompt, guidelines, inputFiles, evalCaseId, attempt, signal).
   * @returns `{ text, raw }` where `raw` records the command, stderr, exit code and cwd.
   * @throws Error when aborted, when the healthcheck fails, on timeout, or on a non-zero exit.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("CLI provider request was aborted before execution");
    }
    await this.ensureHealthy(request.signal);
    const templateValues = buildTemplateValues(request, this.config);
    const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
    const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
    const result = await this.runCommand(renderedCommand, {
      cwd: this.config.cwd,
      env,
      timeoutMs: this.config.timeoutMs,
      signal: request.signal
    });
    if (result.failed || (result.exitCode ?? 0) !== 0) {
      // Abort is reported first: an aborted run may also look like a kill/timeout.
      if (request.signal?.aborted) {
        throw new Error("CLI provider request was aborted");
      }
      if (result.timedOut) {
        throw new Error(
          `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
        );
      }
      const codeText = result.exitCode !== null ? result.exitCode : "unknown";
      const detail = result.stderr.trim() || result.stdout.trim();
      const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
      throw new Error(message);
    }
    return {
      text: result.stdout,
      raw: {
        command: renderedCommand,
        stderr: result.stderr,
        exitCode: result.exitCode ?? 0,
        cwd: this.config.cwd
      }
    };
  }
  /**
   * Runs the configured healthcheck at most once and shares its outcome.
   *
   * A failure is cached as well, so an unhealthy target keeps failing fast.
   * The one exception is a failure that happened while `signal` was aborted:
   * that reflects a cancelled request, not an unhealthy target, so the cache
   * is cleared and the next invocation probes again.
   *
   * @param signal - Abort signal of the request that triggers the check.
   */
  async ensureHealthy(signal) {
    if (!this.config.healthcheck) {
      return;
    }
    if (!this.healthcheckPromise) {
      const pending = this.runHealthcheck(this.config.healthcheck, signal).catch((error) => {
        if (signal?.aborted && this.healthcheckPromise === pending) {
          this.healthcheckPromise = void 0;
        }
        throw error;
      });
      this.healthcheckPromise = pending;
    }
    return this.healthcheckPromise;
  }
  /**
   * Executes a single healthcheck.
   *
   * @param healthcheck - `{ type: "http", url, timeoutMs }` or
   *   `{ type: "command", commandTemplate, timeoutMs, cwd }`.
   * @param signal - Optional abort signal.
   * @throws Error prefixed with `CLI healthcheck failed for '<target>'` on failure.
   */
  async runHealthcheck(healthcheck, signal) {
    if (!healthcheck) {
      return;
    }
    const timeoutMs = healthcheck.timeoutMs ?? this.config.timeoutMs;
    if (healthcheck.type === "http") {
      const controller = new AbortController();
      const abortProbe = () => controller.abort();
      const timer = timeoutMs ? setTimeout(abortProbe, timeoutMs) : void 0;
      if (signal?.aborted) {
        // An "abort" listener never fires for a signal that is already aborted.
        abortProbe();
      } else {
        signal?.addEventListener("abort", abortProbe, { once: true });
      }
      try {
        const response = await fetch(healthcheck.url, { method: "GET", signal: controller.signal });
        if (!response.ok) {
          throw new Error(`HTTP ${response.status} ${response.statusText}`);
        }
      } catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        throw new Error(`CLI healthcheck failed for '${this.targetName}': ${reason}`, { cause: error });
      } finally {
        if (timer !== void 0) {
          clearTimeout(timer);
        }
        // Detach so a long-lived request signal does not retain this probe.
        signal?.removeEventListener("abort", abortProbe);
      }
      return;
    }
    // Command healthcheck: rendered with empty request values.
    const renderedCommand = renderTemplate(
      healthcheck.commandTemplate,
      buildTemplateValues(
        {
          prompt: "",
          guidelines: "",
          inputFiles: [],
          evalCaseId: "",
          attempt: 0
        },
        this.config
      )
    );
    const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
    const result = await this.runCommand(renderedCommand, {
      cwd: healthcheck.cwd ?? this.config.cwd,
      env,
      timeoutMs,
      signal
    });
    if (result.failed || (result.exitCode ?? 0) !== 0) {
      const codeText = result.exitCode !== null ? result.exitCode : "unknown";
      const detail = result.stderr.trim() || result.stdout.trim();
      const message = detail ? `${detail} (exit code ${codeText})` : `CLI healthcheck command exited with code ${codeText}`;
      throw new Error(`CLI healthcheck failed for '${this.targetName}': ${message}`);
    }
  }
};
|
|
898
|
+
/**
 * Builds the placeholder → value map used to render a CLI command template.
 * Every value is shell-escaped; FILES is formatted with `config.filesFormat`.
 *
 * @param request - Request carrying prompt, guidelines, evalCaseId, attempt, inputFiles.
 * @param config - CLI config; only `filesFormat` is read here.
 * @returns Object with PROMPT, GUIDELINES, EVAL_ID, ATTEMPT and FILES keys.
 */
function buildTemplateValues(request, config) {
  const files = normalizeInputFiles(request.inputFiles);
  const { prompt, guidelines, evalCaseId, attempt } = request;
  const values = {};
  values.PROMPT = shellEscape(prompt ?? "");
  values.GUIDELINES = shellEscape(guidelines ?? "");
  values.EVAL_ID = shellEscape(evalCaseId ?? "");
  values.ATTEMPT = shellEscape(String(attempt ?? 0));
  values.FILES = formatFileList(files, config.filesFormat);
  return values;
}
|
|
908
|
+
/**
 * Converts input file paths to absolute paths and removes duplicates,
 * keeping the position of each path's first occurrence.
 *
 * @param inputFiles - Optional list of file paths.
 * @returns Unique absolute paths, or `undefined` when the list is missing or empty.
 */
function normalizeInputFiles(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return void 0;
  }
  const seen = /* @__PURE__ */ new Set();
  for (const candidate of inputFiles) {
    seen.add(path2.resolve(candidate));
  }
  return [...seen];
}
|
|
921
|
+
/**
 * Renders the FILES placeholder: applies `template` to each file and joins
 * the results with a single space.
 *
 * `{path}` is replaced by the shell-escaped path and `{basename}` by the
 * shell-escaped base name. Substitution is done in one pass with a replacer
 * function so that (a) `$`-sequences inside a path (`$$`, `$&`, ...) are
 * inserted literally instead of being treated as replacement patterns, and
 * (b) a path that itself contains the text `{basename}` is not rewritten.
 *
 * @param files - Absolute file paths (may be undefined or empty).
 * @param template - Per-file format; defaults to `"{path}"`.
 * @returns The formatted list, or `""` when there are no files.
 */
function formatFileList(files, template) {
  if (!files || files.length === 0) {
    return "";
  }
  const formatter = template ?? "{path}";
  return files.map((filePath) => {
    const substitutions = {
      path: shellEscape(filePath),
      basename: shellEscape(path2.basename(filePath))
    };
    return formatter.replace(/\{(path|basename)\}/g, (_match, key) => substitutions[key]);
  }).join(" ");
}
|
|
932
|
+
/**
 * Substitutes `{NAME}` placeholders (upper-case letters and underscores) in
 * `template` with the matching entry of `values`.
 *
 * @param template - Command template.
 * @param values - Placeholder name → replacement text.
 * @returns The rendered string; placeholders without a value are left as-is.
 */
function renderTemplate(template, values) {
  const substitute = (placeholder, name) => {
    const value = values[name];
    if (value === void 0) {
      return placeholder;
    }
    return value;
  };
  return template.replace(/\{([A-Z_]+)\}/g, substitute);
}
|
|
938
|
+
/**
 * Quotes `value` so the shell treats it as one literal argument.
 *
 * - POSIX shells: single-quoted, with embedded `'` written as `'"'"'`.
 * - win32 (the default runner executes commands with powershell.exe):
 *   PowerShell single-quoted literal, with embedded single quotes doubled.
 *   Double quotes must not be used there: PowerShell expands `$name` and
 *   `$(...)` inside double-quoted strings and does not treat backslash as
 *   an escape character, so `\"` neither escapes the quote nor prevents
 *   execution of text embedded in a prompt. PowerShell also accepts the
 *   typographic quotes U+2018/U+2019/U+201A/U+201B as single-quote
 *   delimiters, so those are doubled as well.
 *
 * @param value - Raw string (prompt text, file path, ...).
 * @returns The quoted string; `''` for an empty value.
 */
function shellEscape(value) {
  if (value.length === 0) {
    return "''";
  }
  if (process.platform === "win32") {
    // "$&$&" inserts the matched quote character twice.
    return `'${value.replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&")}'`;
  }
  return `'${value.replace(/'/g, `'"'"'`)}'`;
}
|
|
948
|
+
/**
 * Formats the " after Ns" suffix appended to timeout error messages.
 *
 * @param timeoutMs - Timeout in milliseconds (may be undefined).
 * @returns `" after <seconds>s"` with seconds rounded up, or `""` when the
 *   timeout is missing, zero or negative.
 */
function formatTimeoutSuffix(timeoutMs) {
  const hasNoTimeout = !timeoutMs || timeoutMs <= 0;
  return hasNoTimeout ? "" : ` after ${Math.ceil(timeoutMs / 1e3)}s`;
}
|
|
955
|
+
|
|
673
956
|
// src/evaluation/providers/mock.ts
|
|
674
957
|
var DEFAULT_MOCK_RESPONSE = '{"answer":"Mock provider response. Configure targets.yaml to supply a custom value."}';
|
|
675
958
|
var MockProvider = class {
|
|
@@ -713,6 +996,7 @@ var MockProvider = class {
|
|
|
713
996
|
|
|
714
997
|
// src/evaluation/providers/targets.ts
|
|
715
998
|
import { z } from "zod";
|
|
999
|
+
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES"]);
|
|
716
1000
|
var BASE_TARGET_SCHEMA = z.object({
|
|
717
1001
|
name: z.string().min(1, "target name is required"),
|
|
718
1002
|
provider: z.string().min(1, "provider is required"),
|
|
@@ -735,6 +1019,9 @@ function normalizeAzureApiVersion(value) {
|
|
|
735
1019
|
function resolveTargetDefinition(definition, env = process.env) {
|
|
736
1020
|
const parsed = BASE_TARGET_SCHEMA.parse(definition);
|
|
737
1021
|
const provider = parsed.provider.toLowerCase();
|
|
1022
|
+
const providerBatching = resolveOptionalBoolean(
|
|
1023
|
+
parsed.settings?.provider_batching ?? parsed.settings?.providerBatching
|
|
1024
|
+
);
|
|
738
1025
|
switch (provider) {
|
|
739
1026
|
case "azure":
|
|
740
1027
|
case "azure-openai":
|
|
@@ -743,6 +1030,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
743
1030
|
name: parsed.name,
|
|
744
1031
|
judgeTarget: parsed.judge_target,
|
|
745
1032
|
workers: parsed.workers,
|
|
1033
|
+
providerBatching,
|
|
746
1034
|
config: resolveAzureConfig(parsed, env)
|
|
747
1035
|
};
|
|
748
1036
|
case "anthropic":
|
|
@@ -751,6 +1039,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
751
1039
|
name: parsed.name,
|
|
752
1040
|
judgeTarget: parsed.judge_target,
|
|
753
1041
|
workers: parsed.workers,
|
|
1042
|
+
providerBatching,
|
|
754
1043
|
config: resolveAnthropicConfig(parsed, env)
|
|
755
1044
|
};
|
|
756
1045
|
case "gemini":
|
|
@@ -761,14 +1050,26 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
761
1050
|
name: parsed.name,
|
|
762
1051
|
judgeTarget: parsed.judge_target,
|
|
763
1052
|
workers: parsed.workers,
|
|
1053
|
+
providerBatching,
|
|
764
1054
|
config: resolveGeminiConfig(parsed, env)
|
|
765
1055
|
};
|
|
1056
|
+
case "codex":
|
|
1057
|
+
case "codex-cli":
|
|
1058
|
+
return {
|
|
1059
|
+
kind: "codex",
|
|
1060
|
+
name: parsed.name,
|
|
1061
|
+
judgeTarget: parsed.judge_target,
|
|
1062
|
+
workers: parsed.workers,
|
|
1063
|
+
providerBatching,
|
|
1064
|
+
config: resolveCodexConfig(parsed, env)
|
|
1065
|
+
};
|
|
766
1066
|
case "mock":
|
|
767
1067
|
return {
|
|
768
1068
|
kind: "mock",
|
|
769
1069
|
name: parsed.name,
|
|
770
1070
|
judgeTarget: parsed.judge_target,
|
|
771
1071
|
workers: parsed.workers,
|
|
1072
|
+
providerBatching,
|
|
772
1073
|
config: resolveMockConfig(parsed)
|
|
773
1074
|
};
|
|
774
1075
|
case "vscode":
|
|
@@ -778,8 +1079,18 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
778
1079
|
name: parsed.name,
|
|
779
1080
|
judgeTarget: parsed.judge_target,
|
|
780
1081
|
workers: parsed.workers,
|
|
1082
|
+
providerBatching,
|
|
781
1083
|
config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
|
|
782
1084
|
};
|
|
1085
|
+
case "cli":
|
|
1086
|
+
return {
|
|
1087
|
+
kind: "cli",
|
|
1088
|
+
name: parsed.name,
|
|
1089
|
+
judgeTarget: parsed.judge_target,
|
|
1090
|
+
workers: parsed.workers,
|
|
1091
|
+
providerBatching,
|
|
1092
|
+
config: resolveCliConfig(parsed, env)
|
|
1093
|
+
};
|
|
783
1094
|
default:
|
|
784
1095
|
throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
|
|
785
1096
|
}
|
|
@@ -847,6 +1158,29 @@ function resolveGeminiConfig(target, env) {
|
|
|
847
1158
|
maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`)
|
|
848
1159
|
};
|
|
849
1160
|
}
|
|
1161
|
+
/**
 * Resolves the settings of a `codex` / `codex-cli` target.
 *
 * Accepted setting aliases: executable | command | binary, args | arguments,
 * cwd, timeout_seconds | timeoutSeconds.
 *
 * @param target - Parsed target definition (`name`, optional `settings`).
 * @param env - Environment used for variable lookups.
 * @returns `{ executable, args, cwd, timeoutMs }`; `executable` falls back to `"codex"`.
 */
function resolveCodexConfig(target, env) {
  const settings = target.settings ?? {};
  const describe = (what) => `${target.name} codex ${what}`;
  const executable = resolveOptionalString(
    settings.executable ?? settings.command ?? settings.binary,
    env,
    describe("executable"),
    { allowLiteral: true, optionalEnv: true }
  ) ?? "codex";
  const args = resolveOptionalStringArray(settings.args ?? settings.arguments, env, describe("args"));
  const cwd = resolveOptionalString(settings.cwd, env, describe("cwd"), {
    allowLiteral: true,
    optionalEnv: true
  });
  const timeoutMs = resolveTimeoutMs(
    settings.timeout_seconds ?? settings.timeoutSeconds,
    describe("timeout")
  );
  return { executable, args, cwd, timeoutMs };
}
|
|
850
1184
|
function resolveMockConfig(target) {
|
|
851
1185
|
const settings = target.settings ?? {};
|
|
852
1186
|
const response = typeof settings.response === "string" ? settings.response : void 0;
|
|
@@ -876,6 +1210,125 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
876
1210
|
workspaceTemplate
|
|
877
1211
|
};
|
|
878
1212
|
}
|
|
1213
|
+
/**
 * Resolves the settings of a `cli` target.
 *
 * Accepted setting aliases: command_template | commandTemplate,
 * files_format | filesFormat | attachments_format | attachmentsFormat,
 * cwd, env, timeout_seconds | timeoutSeconds, healthcheck.
 *
 * @param target - Parsed target definition (`name`, optional `settings`).
 * @param env - Environment used for variable lookups.
 * @returns `{ commandTemplate, filesFormat, cwd, env, timeoutMs, healthcheck }`.
 * @throws Whatever the resolver helpers throw, e.g. for an unsupported placeholder.
 */
function resolveCliConfig(target, env) {
  const settings = target.settings ?? {};
  const templateLabel = `${target.name} CLI command template`;
  const filesFormatSource = settings.files_format ?? settings.filesFormat ?? settings.attachments_format ?? settings.attachmentsFormat;
  const filesFormat = resolveOptionalLiteralString(filesFormatSource);
  const cwd = resolveOptionalString(settings.cwd, env, `${target.name} working directory`, {
    allowLiteral: true,
    optionalEnv: true
  });
  const envOverrides = resolveEnvOverrides(settings.env, env, target.name);
  const timeoutSource = settings.timeout_seconds ?? settings.timeoutSeconds;
  const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} timeout`);
  const healthcheck = resolveCliHealthcheck(settings.healthcheck, env, target.name);
  // The command template is resolved last, with literal values allowed.
  const commandTemplate = resolveString(
    settings.command_template ?? settings.commandTemplate,
    env,
    templateLabel,
    true
  );
  assertSupportedCliPlaceholders(commandTemplate, templateLabel);
  return { commandTemplate, filesFormat, cwd, env: envOverrides, timeoutMs, healthcheck };
}
|
|
1242
|
+
/**
 * Resolves the `env` setting of a CLI target into a map of variable overrides.
 *
 * @param source - Raw `env` setting; must be an object map of strings when present.
 * @param env - Environment used to resolve each value via `resolveString`.
 * @param targetName - Target name used in error messages.
 * @returns The resolved map, or `undefined` when absent or empty.
 * @throws Error when `source` is not an object map or a value is not a string.
 */
function resolveEnvOverrides(source, env, targetName) {
  if (source === void 0 || source === null) {
    return void 0;
  }
  const isObjectMap = typeof source === "object" && !Array.isArray(source);
  if (!isObjectMap) {
    throw new Error(`${targetName} env overrides must be an object map of strings`);
  }
  const overrides = {};
  for (const [variable, rawValue] of Object.entries(source)) {
    if (typeof rawValue !== "string") {
      throw new Error(`${targetName} env override '${variable}' must be a string`);
    }
    overrides[variable] = resolveString(rawValue, env, `${targetName} env override '${variable}'`);
  }
  if (Object.keys(overrides).length === 0) {
    return void 0;
  }
  return overrides;
}
|
|
1260
|
+
/**
 * Converts a timeout expressed in seconds into whole milliseconds.
 *
 * @param source - Raw seconds value, parsed by `resolveOptionalNumber`.
 * @param description - Label used in error messages.
 * @returns Milliseconds rounded down, or `undefined` when no timeout is set.
 * @throws Error when the value is zero or negative.
 */
function resolveTimeoutMs(source, description) {
  const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
  if (seconds !== void 0 && seconds <= 0) {
    throw new Error(`${description} must be greater than zero seconds`);
  }
  return seconds === void 0 ? void 0 : Math.floor(seconds * 1e3);
}
|
|
1270
|
+
/**
 * Resolves the optional `healthcheck` setting of a CLI target.
 *
 * @param source - Raw healthcheck object with `type` "http" or "command".
 * @param env - Environment used for variable lookups.
 * @param targetName - Target name used in error messages.
 * @returns `{ type: "http", url, timeoutMs }`,
 *   `{ type: "command", commandTemplate, timeoutMs, cwd }`, or `undefined` when absent.
 * @throws Error when `source` is not an object or its type is unsupported.
 */
function resolveCliHealthcheck(source, env, targetName) {
  if (source === void 0 || source === null) {
    return void 0;
  }
  if (typeof source !== "object" || Array.isArray(source)) {
    throw new Error(`${targetName} healthcheck must be an object`);
  }
  // The timeout is resolved before the type is inspected, so an invalid
  // timeout is reported even when the type is also invalid.
  const timeoutMs = resolveTimeoutMs(
    source.timeout_seconds ?? source.timeoutSeconds,
    `${targetName} healthcheck timeout`
  );
  switch (source.type) {
    case "http": {
      const url = resolveString(source.url, env, `${targetName} healthcheck URL`);
      return { type: "http", url, timeoutMs };
    }
    case "command": {
      const templateLabel = `${targetName} healthcheck command template`;
      const commandTemplate = resolveString(
        source.command_template ?? source.commandTemplate,
        env,
        templateLabel,
        true
      );
      assertSupportedCliPlaceholders(commandTemplate, templateLabel);
      const cwd = resolveOptionalString(source.cwd, env, `${targetName} healthcheck cwd`, {
        allowLiteral: true,
        optionalEnv: true
      });
      return { type: "command", commandTemplate, timeoutMs, cwd };
    }
    default:
      throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
  }
}
|
|
1312
|
+
/**
 * Rejects a command template that uses a placeholder outside CLI_PLACEHOLDERS.
 *
 * @param template - Command template to inspect.
 * @param description - Label used at the start of the error message.
 * @throws Error naming the first unsupported placeholder and listing the supported ones.
 */
function assertSupportedCliPlaceholders(template, description) {
  const unsupported = extractCliPlaceholders(template).find(
    (placeholder) => !CLI_PLACEHOLDERS.has(placeholder)
  );
  if (unsupported === void 0) {
    return;
  }
  throw new Error(
    `${description} includes unsupported placeholder '{${unsupported}}'. Supported placeholders: ${Array.from(CLI_PLACEHOLDERS).join(", ")}`
  );
}
|
|
1322
|
+
/**
 * Lists the `{NAME}` placeholders (upper-case letters and underscores) that
 * occur in a command template, in order of appearance and with duplicates kept.
 *
 * @param template - Command template.
 * @returns Placeholder names without the surrounding braces.
 */
function extractCliPlaceholders(template) {
  return Array.from(template.matchAll(/\{([A-Z_]+)\}/g), (occurrence) => occurrence[1]).filter(Boolean);
}
|
|
879
1332
|
function resolveString(source, env, description, allowLiteral = false) {
|
|
880
1333
|
const value = resolveOptionalString(source, env, description, {
|
|
881
1334
|
allowLiteral,
|
|
@@ -906,11 +1359,14 @@ function resolveOptionalString(source, env, description, options) {
|
|
|
906
1359
|
}
|
|
907
1360
|
const allowLiteral = options?.allowLiteral ?? false;
|
|
908
1361
|
const optionalEnv = options?.optionalEnv ?? false;
|
|
909
|
-
|
|
1362
|
+
const looksLikeEnv = isLikelyEnvReference(trimmed);
|
|
1363
|
+
if (looksLikeEnv) {
|
|
910
1364
|
if (optionalEnv) {
|
|
911
1365
|
return void 0;
|
|
912
1366
|
}
|
|
913
|
-
|
|
1367
|
+
if (!allowLiteral) {
|
|
1368
|
+
throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
|
|
1369
|
+
}
|
|
914
1370
|
}
|
|
915
1371
|
return trimmed;
|
|
916
1372
|
}
|
|
@@ -960,15 +1416,48 @@ function resolveOptionalBoolean(source) {
|
|
|
960
1416
|
/**
 * Heuristic: does `value` look like an environment variable name, i.e. does
 * it consist solely of upper-case letters, digits and underscores?
 *
 * @param value - Candidate string.
 * @returns `true` when the whole value matches that shape.
 */
function isLikelyEnvReference(value) {
  const envNameShape = /^[A-Z0-9_]+$/;
  return envNameShape.test(value);
}
|
|
1419
|
+
/**
 * Resolves an optional array-of-strings setting.
 *
 * Each item is trimmed. When the trimmed item names a variable in `env`, the
 * variable's value is used; otherwise the trimmed item is kept literally.
 * Only string-valued lookups count as variables, so an item that happens to
 * match an inherited object member (for example "toString" or "constructor")
 * is kept as a literal instead of raising a TypeError.
 *
 * @param source - Raw setting value.
 * @param env - Environment used for variable lookups.
 * @param description - Label used in error messages.
 * @returns The resolved strings, or `undefined` when `source` is missing or empty.
 * @throws Error when `source` is not an array, an item is not a string or is
 *   blank, or a referenced environment variable is blank.
 */
function resolveOptionalStringArray(source, env, description) {
  if (source === void 0 || source === null) {
    return void 0;
  }
  if (!Array.isArray(source)) {
    throw new Error(`${description} must be an array of strings`);
  }
  if (source.length === 0) {
    return void 0;
  }
  const resolved = [];
  // entries() also yields holes of sparse arrays, which are then rejected as non-strings.
  for (const [i, item] of source.entries()) {
    if (typeof item !== "string") {
      throw new Error(`${description}[${i}] must be a string`);
    }
    const trimmed = item.trim();
    if (trimmed.length === 0) {
      throw new Error(`${description}[${i}] cannot be empty`);
    }
    const envValue = env[trimmed];
    if (typeof envValue !== "string") {
      resolved.push(trimmed);
      continue;
    }
    if (envValue.trim().length === 0) {
      throw new Error(`Environment variable '${trimmed}' for ${description}[${i}] is empty`);
    }
    resolved.push(envValue);
  }
  return resolved;
}
|
|
963
1451
|
|
|
964
1452
|
// src/evaluation/providers/vscode.ts
|
|
965
1453
|
import { readFile as readFile2 } from "node:fs/promises";
|
|
966
|
-
import
|
|
967
|
-
import { dispatchAgentSession, getSubagentRoot, provisionSubagents } from "subagent";
|
|
1454
|
+
import path3 from "node:path";
|
|
1455
|
+
import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
|
|
968
1456
|
var VSCodeProvider = class {
|
|
969
1457
|
id;
|
|
970
1458
|
kind;
|
|
971
1459
|
targetName;
|
|
1460
|
+
supportsBatch = true;
|
|
972
1461
|
config;
|
|
973
1462
|
constructor(targetName, config, kind) {
|
|
974
1463
|
this.id = `${kind}:${targetName}`;
|
|
@@ -980,12 +1469,11 @@ var VSCodeProvider = class {
|
|
|
980
1469
|
if (request.signal?.aborted) {
|
|
981
1470
|
throw new Error("VS Code provider request was aborted before dispatch");
|
|
982
1471
|
}
|
|
983
|
-
const
|
|
984
|
-
const promptContent = buildPromptDocument(request,
|
|
1472
|
+
const inputFiles = normalizeAttachments(request.inputFiles);
|
|
1473
|
+
const promptContent = buildPromptDocument(request, inputFiles, request.guideline_patterns);
|
|
985
1474
|
const session = await dispatchAgentSession({
|
|
986
1475
|
userQuery: promptContent,
|
|
987
|
-
|
|
988
|
-
extraAttachments: attachments,
|
|
1476
|
+
extraAttachments: inputFiles,
|
|
989
1477
|
wait: this.config.waitForResponse,
|
|
990
1478
|
dryRun: this.config.dryRun,
|
|
991
1479
|
vscodeCmd: this.config.command,
|
|
@@ -1002,7 +1490,7 @@ var VSCodeProvider = class {
|
|
|
1002
1490
|
text: "",
|
|
1003
1491
|
raw: {
|
|
1004
1492
|
session,
|
|
1005
|
-
|
|
1493
|
+
inputFiles
|
|
1006
1494
|
}
|
|
1007
1495
|
};
|
|
1008
1496
|
}
|
|
@@ -1011,42 +1499,106 @@ var VSCodeProvider = class {
|
|
|
1011
1499
|
text: responseText,
|
|
1012
1500
|
raw: {
|
|
1013
1501
|
session,
|
|
1014
|
-
|
|
1502
|
+
inputFiles
|
|
1015
1503
|
}
|
|
1016
1504
|
};
|
|
1017
1505
|
}
|
|
1506
|
+
async invokeBatch(requests) {
|
|
1507
|
+
if (requests.length === 0) {
|
|
1508
|
+
return [];
|
|
1509
|
+
}
|
|
1510
|
+
const normalizedRequests = requests.map((req) => ({
|
|
1511
|
+
request: req,
|
|
1512
|
+
inputFiles: normalizeAttachments(req.inputFiles)
|
|
1513
|
+
}));
|
|
1514
|
+
const combinedInputFiles = mergeAttachments(
|
|
1515
|
+
normalizedRequests.map(({ inputFiles }) => inputFiles)
|
|
1516
|
+
);
|
|
1517
|
+
const userQueries = normalizedRequests.map(
|
|
1518
|
+
({ request, inputFiles }) => buildPromptDocument(request, inputFiles, request.guideline_patterns)
|
|
1519
|
+
);
|
|
1520
|
+
const session = await dispatchBatchAgent({
|
|
1521
|
+
userQueries,
|
|
1522
|
+
extraAttachments: combinedInputFiles,
|
|
1523
|
+
wait: this.config.waitForResponse,
|
|
1524
|
+
dryRun: this.config.dryRun,
|
|
1525
|
+
vscodeCmd: this.config.command,
|
|
1526
|
+
subagentRoot: this.config.subagentRoot,
|
|
1527
|
+
workspaceTemplate: this.config.workspaceTemplate,
|
|
1528
|
+
silent: true
|
|
1529
|
+
});
|
|
1530
|
+
if (session.exitCode !== 0 || !session.responseFiles) {
|
|
1531
|
+
const failure = session.error ?? "VS Code subagent did not produce batch responses";
|
|
1532
|
+
throw new Error(failure);
|
|
1533
|
+
}
|
|
1534
|
+
if (this.config.dryRun) {
|
|
1535
|
+
return normalizedRequests.map(({ inputFiles }) => ({
|
|
1536
|
+
text: "",
|
|
1537
|
+
raw: {
|
|
1538
|
+
session,
|
|
1539
|
+
inputFiles,
|
|
1540
|
+
allInputFiles: combinedInputFiles
|
|
1541
|
+
}
|
|
1542
|
+
}));
|
|
1543
|
+
}
|
|
1544
|
+
if (session.responseFiles.length !== requests.length) {
|
|
1545
|
+
throw new Error(
|
|
1546
|
+
`VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
|
|
1547
|
+
);
|
|
1548
|
+
}
|
|
1549
|
+
const responses = [];
|
|
1550
|
+
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
1551
|
+
const responseText = await readFile2(responseFile, "utf8");
|
|
1552
|
+
responses.push({
|
|
1553
|
+
text: responseText,
|
|
1554
|
+
raw: {
|
|
1555
|
+
session,
|
|
1556
|
+
inputFiles: normalizedRequests[index]?.inputFiles,
|
|
1557
|
+
allInputFiles: combinedInputFiles,
|
|
1558
|
+
responseFile
|
|
1559
|
+
}
|
|
1560
|
+
});
|
|
1561
|
+
}
|
|
1562
|
+
return responses;
|
|
1563
|
+
}
|
|
1018
1564
|
};
|
|
1019
1565
|
function buildPromptDocument(request, attachments, guidelinePatterns) {
|
|
1020
1566
|
const parts = [];
|
|
1021
1567
|
const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
|
|
1022
|
-
|
|
1023
|
-
|
|
1568
|
+
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
1569
|
+
const nonGuidelineAttachments = attachmentFiles.filter(
|
|
1570
|
+
(file) => !guidelineFiles.includes(file)
|
|
1571
|
+
);
|
|
1572
|
+
const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineAttachments);
|
|
1573
|
+
if (prereadBlock.length > 0) {
|
|
1574
|
+
parts.push("\n", prereadBlock);
|
|
1024
1575
|
}
|
|
1025
1576
|
parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
|
|
1026
1577
|
return parts.join("\n").trim();
|
|
1027
1578
|
}
|
|
1028
|
-
function buildMandatoryPrereadBlock(guidelineFiles) {
|
|
1029
|
-
if (guidelineFiles.length === 0) {
|
|
1579
|
+
function buildMandatoryPrereadBlock(guidelineFiles, attachmentFiles) {
|
|
1580
|
+
if (guidelineFiles.length === 0 && attachmentFiles.length === 0) {
|
|
1030
1581
|
return "";
|
|
1031
1582
|
}
|
|
1032
|
-
const
|
|
1033
|
-
|
|
1034
|
-
for (const absolutePath of guidelineFiles) {
|
|
1035
|
-
counter += 1;
|
|
1036
|
-
const fileName = path2.basename(absolutePath);
|
|
1583
|
+
const buildList = (files) => files.map((absolutePath) => {
|
|
1584
|
+
const fileName = path3.basename(absolutePath);
|
|
1037
1585
|
const fileUri = pathToFileUri(absolutePath);
|
|
1038
|
-
|
|
1039
|
-
}
|
|
1040
|
-
const
|
|
1041
|
-
|
|
1042
|
-
`Read all guideline files:
|
|
1043
|
-
${
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1586
|
+
return `* [${fileName}](${fileUri})`;
|
|
1587
|
+
});
|
|
1588
|
+
const sections = [];
|
|
1589
|
+
if (guidelineFiles.length > 0) {
|
|
1590
|
+
sections.push(`Read all guideline files:
|
|
1591
|
+
${buildList(guidelineFiles).join("\n")}.`);
|
|
1592
|
+
}
|
|
1593
|
+
if (attachmentFiles.length > 0) {
|
|
1594
|
+
sections.push(`Read all attachment files:
|
|
1595
|
+
${buildList(attachmentFiles).join("\n")}.`);
|
|
1596
|
+
}
|
|
1597
|
+
sections.push(
|
|
1598
|
+
"If any file is missing, fail with ERROR: missing-file <filename> and stop.",
|
|
1599
|
+
"Then apply system_instructions on the user query below."
|
|
1600
|
+
);
|
|
1601
|
+
return sections.join("\n");
|
|
1050
1602
|
}
|
|
1051
1603
|
function collectGuidelineFiles(attachments, guidelinePatterns) {
|
|
1052
1604
|
if (!attachments || attachments.length === 0) {
|
|
@@ -1054,8 +1606,8 @@ function collectGuidelineFiles(attachments, guidelinePatterns) {
|
|
|
1054
1606
|
}
|
|
1055
1607
|
const unique = /* @__PURE__ */ new Map();
|
|
1056
1608
|
for (const attachment of attachments) {
|
|
1057
|
-
const absolutePath =
|
|
1058
|
-
const normalized = absolutePath.split(
|
|
1609
|
+
const absolutePath = path3.resolve(attachment);
|
|
1610
|
+
const normalized = absolutePath.split(path3.sep).join("/");
|
|
1059
1611
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1060
1612
|
if (!unique.has(absolutePath)) {
|
|
1061
1613
|
unique.set(absolutePath, absolutePath);
|
|
@@ -1064,8 +1616,21 @@ function collectGuidelineFiles(attachments, guidelinePatterns) {
|
|
|
1064
1616
|
}
|
|
1065
1617
|
return Array.from(unique.values());
|
|
1066
1618
|
}
|
|
1619
|
+
function collectAttachmentFiles(attachments) {
|
|
1620
|
+
if (!attachments || attachments.length === 0) {
|
|
1621
|
+
return [];
|
|
1622
|
+
}
|
|
1623
|
+
const unique = /* @__PURE__ */ new Map();
|
|
1624
|
+
for (const attachment of attachments) {
|
|
1625
|
+
const absolutePath = path3.resolve(attachment);
|
|
1626
|
+
if (!unique.has(absolutePath)) {
|
|
1627
|
+
unique.set(absolutePath, absolutePath);
|
|
1628
|
+
}
|
|
1629
|
+
}
|
|
1630
|
+
return Array.from(unique.values());
|
|
1631
|
+
}
|
|
1067
1632
|
function pathToFileUri(filePath) {
|
|
1068
|
-
const absolutePath =
|
|
1633
|
+
const absolutePath = path3.isAbsolute(filePath) ? filePath : path3.resolve(filePath);
|
|
1069
1634
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
1070
1635
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
1071
1636
|
return `file:///${normalizedPath}`;
|
|
@@ -1078,10 +1643,20 @@ function normalizeAttachments(attachments) {
|
|
|
1078
1643
|
}
|
|
1079
1644
|
const deduped = /* @__PURE__ */ new Set();
|
|
1080
1645
|
for (const attachment of attachments) {
|
|
1081
|
-
deduped.add(
|
|
1646
|
+
deduped.add(path3.resolve(attachment));
|
|
1082
1647
|
}
|
|
1083
1648
|
return Array.from(deduped);
|
|
1084
1649
|
}
|
|
1650
|
+
function mergeAttachments(all) {
|
|
1651
|
+
const deduped = /* @__PURE__ */ new Set();
|
|
1652
|
+
for (const list of all) {
|
|
1653
|
+
if (!list) continue;
|
|
1654
|
+
for (const inputFile of list) {
|
|
1655
|
+
deduped.add(path3.resolve(inputFile));
|
|
1656
|
+
}
|
|
1657
|
+
}
|
|
1658
|
+
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
1659
|
+
}
|
|
1085
1660
|
async function ensureVSCodeSubagents(options) {
|
|
1086
1661
|
const { kind, count, verbose = false } = options;
|
|
1087
1662
|
const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
|
|
@@ -1095,36 +1670,612 @@ async function ensureVSCodeSubagents(options) {
|
|
|
1095
1670
|
subagents: count,
|
|
1096
1671
|
dryRun: false
|
|
1097
1672
|
});
|
|
1098
|
-
if (verbose) {
|
|
1099
|
-
if (result.created.length > 0) {
|
|
1100
|
-
console.log(`Created ${result.created.length} new subagent(s)`);
|
|
1673
|
+
if (verbose) {
|
|
1674
|
+
if (result.created.length > 0) {
|
|
1675
|
+
console.log(`Created ${result.created.length} new subagent(s)`);
|
|
1676
|
+
}
|
|
1677
|
+
if (result.skippedExisting.length > 0) {
|
|
1678
|
+
console.log(`Reusing ${result.skippedExisting.length} existing unlocked subagent(s)`);
|
|
1679
|
+
}
|
|
1680
|
+
console.log(`
|
|
1681
|
+
total unlocked subagents available: ${result.created.length + result.skippedExisting.length}`);
|
|
1682
|
+
}
|
|
1683
|
+
return {
|
|
1684
|
+
provisioned: true,
|
|
1685
|
+
message: `Provisioned ${count} subagent(s): ${result.created.length} created, ${result.skippedExisting.length} reused`
|
|
1686
|
+
};
|
|
1687
|
+
} catch (error) {
|
|
1688
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
1689
|
+
if (verbose) {
|
|
1690
|
+
console.warn(`Provisioning failed (continuing anyway): ${errorMessage}`);
|
|
1691
|
+
}
|
|
1692
|
+
return {
|
|
1693
|
+
provisioned: false,
|
|
1694
|
+
message: `Provisioning failed: ${errorMessage}`
|
|
1695
|
+
};
|
|
1696
|
+
}
|
|
1697
|
+
}
|
|
1698
|
+
|
|
1699
|
+
// src/evaluation/providers/codex.ts
|
|
1700
|
+
import { exec as execCallback, spawn } from "node:child_process";
|
|
1701
|
+
import { constants as constants2 } from "node:fs";
|
|
1702
|
+
import { access as access2, copyFile, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
|
|
1703
|
+
import { tmpdir } from "node:os";
|
|
1704
|
+
import path5 from "node:path";
|
|
1705
|
+
import { promisify as promisify2 } from "node:util";
|
|
1706
|
+
|
|
1707
|
+
// src/evaluation/providers/preread.ts
|
|
1708
|
+
import path4 from "node:path";
|
|
1709
|
+
function buildPromptDocument2(request, inputFiles, options) {
|
|
1710
|
+
const parts = [];
|
|
1711
|
+
const guidelineFiles = collectGuidelineFiles2(
|
|
1712
|
+
inputFiles,
|
|
1713
|
+
options?.guidelinePatterns ?? request.guideline_patterns,
|
|
1714
|
+
options?.guidelineOverrides
|
|
1715
|
+
);
|
|
1716
|
+
const inputFilesList = collectInputFiles(inputFiles);
|
|
1717
|
+
const nonGuidelineInputFiles = inputFilesList.filter(
|
|
1718
|
+
(file) => !guidelineFiles.includes(file)
|
|
1719
|
+
);
|
|
1720
|
+
const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineInputFiles);
|
|
1721
|
+
if (prereadBlock.length > 0) {
|
|
1722
|
+
parts.push("\n", prereadBlock);
|
|
1723
|
+
}
|
|
1724
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
|
|
1725
|
+
return parts.join("\n").trim();
|
|
1726
|
+
}
|
|
1727
|
+
function normalizeInputFiles2(inputFiles) {
|
|
1728
|
+
if (!inputFiles || inputFiles.length === 0) {
|
|
1729
|
+
return void 0;
|
|
1730
|
+
}
|
|
1731
|
+
const deduped = /* @__PURE__ */ new Map();
|
|
1732
|
+
for (const inputFile of inputFiles) {
|
|
1733
|
+
const absolutePath = path4.resolve(inputFile);
|
|
1734
|
+
if (!deduped.has(absolutePath)) {
|
|
1735
|
+
deduped.set(absolutePath, absolutePath);
|
|
1736
|
+
}
|
|
1737
|
+
}
|
|
1738
|
+
return Array.from(deduped.values());
|
|
1739
|
+
}
|
|
1740
|
+
function collectGuidelineFiles2(inputFiles, guidelinePatterns, overrides) {
|
|
1741
|
+
if (!inputFiles || inputFiles.length === 0) {
|
|
1742
|
+
return [];
|
|
1743
|
+
}
|
|
1744
|
+
const unique = /* @__PURE__ */ new Map();
|
|
1745
|
+
for (const inputFile of inputFiles) {
|
|
1746
|
+
const absolutePath = path4.resolve(inputFile);
|
|
1747
|
+
if (overrides?.has(absolutePath)) {
|
|
1748
|
+
if (!unique.has(absolutePath)) {
|
|
1749
|
+
unique.set(absolutePath, absolutePath);
|
|
1750
|
+
}
|
|
1751
|
+
continue;
|
|
1752
|
+
}
|
|
1753
|
+
const normalized = absolutePath.split(path4.sep).join("/");
|
|
1754
|
+
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1755
|
+
if (!unique.has(absolutePath)) {
|
|
1756
|
+
unique.set(absolutePath, absolutePath);
|
|
1757
|
+
}
|
|
1758
|
+
}
|
|
1759
|
+
}
|
|
1760
|
+
return Array.from(unique.values());
|
|
1761
|
+
}
|
|
1762
|
+
function collectInputFiles(inputFiles) {
|
|
1763
|
+
if (!inputFiles || inputFiles.length === 0) {
|
|
1764
|
+
return [];
|
|
1765
|
+
}
|
|
1766
|
+
const unique = /* @__PURE__ */ new Map();
|
|
1767
|
+
for (const inputFile of inputFiles) {
|
|
1768
|
+
const absolutePath = path4.resolve(inputFile);
|
|
1769
|
+
if (!unique.has(absolutePath)) {
|
|
1770
|
+
unique.set(absolutePath, absolutePath);
|
|
1771
|
+
}
|
|
1772
|
+
}
|
|
1773
|
+
return Array.from(unique.values());
|
|
1774
|
+
}
|
|
1775
|
+
function buildMandatoryPrereadBlock2(guidelineFiles, inputFiles) {
|
|
1776
|
+
if (guidelineFiles.length === 0 && inputFiles.length === 0) {
|
|
1777
|
+
return "";
|
|
1778
|
+
}
|
|
1779
|
+
const buildList = (files) => files.map((absolutePath) => {
|
|
1780
|
+
const fileName = path4.basename(absolutePath);
|
|
1781
|
+
const fileUri = pathToFileUri2(absolutePath);
|
|
1782
|
+
return `* [${fileName}](${fileUri})`;
|
|
1783
|
+
});
|
|
1784
|
+
const sections = [];
|
|
1785
|
+
if (guidelineFiles.length > 0) {
|
|
1786
|
+
sections.push(`Read all guideline files:
|
|
1787
|
+
${buildList(guidelineFiles).join("\n")}.`);
|
|
1788
|
+
}
|
|
1789
|
+
if (inputFiles.length > 0) {
|
|
1790
|
+
sections.push(`Read all input files:
|
|
1791
|
+
${buildList(inputFiles).join("\n")}.`);
|
|
1792
|
+
}
|
|
1793
|
+
sections.push(
|
|
1794
|
+
"If any file is missing, fail with ERROR: missing-file <filename> and stop.",
|
|
1795
|
+
"Then apply system_instructions on the user query below."
|
|
1796
|
+
);
|
|
1797
|
+
return sections.join("\n");
|
|
1798
|
+
}
|
|
1799
|
+
function pathToFileUri2(filePath) {
|
|
1800
|
+
const absolutePath = path4.isAbsolute(filePath) ? filePath : path4.resolve(filePath);
|
|
1801
|
+
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
1802
|
+
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
1803
|
+
return `file:///${normalizedPath}`;
|
|
1804
|
+
}
|
|
1805
|
+
return `file://${normalizedPath}`;
|
|
1806
|
+
}
|
|
1807
|
+
|
|
1808
|
+
// src/evaluation/providers/codex.ts
|
|
1809
|
+
var execAsync2 = promisify2(execCallback);
|
|
1810
|
+
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
1811
|
+
var PROMPT_FILENAME = "prompt.md";
|
|
1812
|
+
var FILES_DIR = "files";
|
|
1813
|
+
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
1814
|
+
var CodexProvider = class {
|
|
1815
|
+
id;
|
|
1816
|
+
kind = "codex";
|
|
1817
|
+
targetName;
|
|
1818
|
+
supportsBatch = false;
|
|
1819
|
+
config;
|
|
1820
|
+
runCodex;
|
|
1821
|
+
environmentCheck;
|
|
1822
|
+
resolvedExecutable;
|
|
1823
|
+
constructor(targetName, config, runner = defaultCodexRunner) {
|
|
1824
|
+
this.id = `codex:${targetName}`;
|
|
1825
|
+
this.targetName = targetName;
|
|
1826
|
+
this.config = config;
|
|
1827
|
+
this.runCodex = runner;
|
|
1828
|
+
}
|
|
1829
|
+
async invoke(request) {
|
|
1830
|
+
if (request.signal?.aborted) {
|
|
1831
|
+
throw new Error("Codex provider request was aborted before execution");
|
|
1832
|
+
}
|
|
1833
|
+
await this.ensureEnvironmentReady();
|
|
1834
|
+
const inputFiles = normalizeInputFiles2(request.inputFiles);
|
|
1835
|
+
const originalGuidelines = new Set(
|
|
1836
|
+
collectGuidelineFiles2(inputFiles, request.guideline_patterns).map((file) => path5.resolve(file))
|
|
1837
|
+
);
|
|
1838
|
+
const workspaceRoot = await this.createWorkspace();
|
|
1839
|
+
try {
|
|
1840
|
+
const { mirroredInputFiles, guidelineMirrors } = await this.mirrorInputFiles(
|
|
1841
|
+
inputFiles,
|
|
1842
|
+
workspaceRoot,
|
|
1843
|
+
originalGuidelines
|
|
1844
|
+
);
|
|
1845
|
+
const promptContent = buildPromptDocument2(request, mirroredInputFiles, {
|
|
1846
|
+
guidelinePatterns: request.guideline_patterns,
|
|
1847
|
+
guidelineOverrides: guidelineMirrors
|
|
1848
|
+
});
|
|
1849
|
+
const promptFile = path5.join(workspaceRoot, PROMPT_FILENAME);
|
|
1850
|
+
await writeFile(promptFile, promptContent, "utf8");
|
|
1851
|
+
const args = this.buildCodexArgs();
|
|
1852
|
+
const cwd = this.resolveCwd(workspaceRoot);
|
|
1853
|
+
const result = await this.executeCodex(args, cwd, promptContent, request.signal);
|
|
1854
|
+
if (result.timedOut) {
|
|
1855
|
+
throw new Error(
|
|
1856
|
+
`Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
|
|
1857
|
+
);
|
|
1858
|
+
}
|
|
1859
|
+
if (result.exitCode !== 0) {
|
|
1860
|
+
const detail = pickDetail(result.stderr, result.stdout);
|
|
1861
|
+
const prefix = `Codex CLI exited with code ${result.exitCode}`;
|
|
1862
|
+
throw new Error(detail ? `${prefix}: ${detail}` : prefix);
|
|
1863
|
+
}
|
|
1864
|
+
const parsed = parseCodexJson(result.stdout);
|
|
1865
|
+
const assistantText = extractAssistantText(parsed);
|
|
1866
|
+
return {
|
|
1867
|
+
text: assistantText,
|
|
1868
|
+
raw: {
|
|
1869
|
+
response: parsed,
|
|
1870
|
+
stdout: result.stdout,
|
|
1871
|
+
stderr: result.stderr,
|
|
1872
|
+
exitCode: result.exitCode,
|
|
1873
|
+
args,
|
|
1874
|
+
executable: this.resolvedExecutable ?? this.config.executable,
|
|
1875
|
+
promptFile,
|
|
1876
|
+
workspace: workspaceRoot,
|
|
1877
|
+
inputFiles: mirroredInputFiles
|
|
1878
|
+
}
|
|
1879
|
+
};
|
|
1880
|
+
} finally {
|
|
1881
|
+
await this.cleanupWorkspace(workspaceRoot);
|
|
1882
|
+
}
|
|
1883
|
+
}
|
|
1884
|
+
async ensureEnvironmentReady() {
|
|
1885
|
+
if (!this.environmentCheck) {
|
|
1886
|
+
this.environmentCheck = this.validateEnvironment();
|
|
1887
|
+
}
|
|
1888
|
+
await this.environmentCheck;
|
|
1889
|
+
}
|
|
1890
|
+
async validateEnvironment() {
|
|
1891
|
+
this.resolvedExecutable = await locateExecutable(this.config.executable);
|
|
1892
|
+
}
|
|
1893
|
+
resolveCwd(workspaceRoot) {
|
|
1894
|
+
if (!this.config.cwd) {
|
|
1895
|
+
return workspaceRoot;
|
|
1896
|
+
}
|
|
1897
|
+
return path5.resolve(this.config.cwd);
|
|
1898
|
+
}
|
|
1899
|
+
buildCodexArgs() {
|
|
1900
|
+
const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
|
|
1901
|
+
if (this.config.args && this.config.args.length > 0) {
|
|
1902
|
+
args.push(...this.config.args);
|
|
1903
|
+
}
|
|
1904
|
+
args.push("-");
|
|
1905
|
+
return args;
|
|
1906
|
+
}
|
|
1907
|
+
async executeCodex(args, cwd, promptContent, signal) {
|
|
1908
|
+
try {
|
|
1909
|
+
return await this.runCodex({
|
|
1910
|
+
executable: this.resolvedExecutable ?? this.config.executable,
|
|
1911
|
+
args,
|
|
1912
|
+
cwd,
|
|
1913
|
+
prompt: promptContent,
|
|
1914
|
+
timeoutMs: this.config.timeoutMs,
|
|
1915
|
+
env: process.env,
|
|
1916
|
+
signal
|
|
1917
|
+
});
|
|
1918
|
+
} catch (error) {
|
|
1919
|
+
const err = error;
|
|
1920
|
+
if (err.code === "ENOENT") {
|
|
1921
|
+
throw new Error(
|
|
1922
|
+
`Codex executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
|
|
1923
|
+
);
|
|
1924
|
+
}
|
|
1925
|
+
throw error;
|
|
1926
|
+
}
|
|
1927
|
+
}
|
|
1928
|
+
async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
|
|
1929
|
+
if (!inputFiles || inputFiles.length === 0) {
|
|
1930
|
+
return {
|
|
1931
|
+
mirroredInputFiles: void 0,
|
|
1932
|
+
guidelineMirrors: /* @__PURE__ */ new Set()
|
|
1933
|
+
};
|
|
1934
|
+
}
|
|
1935
|
+
const filesRoot = path5.join(workspaceRoot, FILES_DIR);
|
|
1936
|
+
await mkdir(filesRoot, { recursive: true });
|
|
1937
|
+
const mirrored = [];
|
|
1938
|
+
const guidelineMirrors = /* @__PURE__ */ new Set();
|
|
1939
|
+
const nameCounts = /* @__PURE__ */ new Map();
|
|
1940
|
+
for (const inputFile of inputFiles) {
|
|
1941
|
+
const absoluteSource = path5.resolve(inputFile);
|
|
1942
|
+
const baseName = path5.basename(absoluteSource);
|
|
1943
|
+
const count = nameCounts.get(baseName) ?? 0;
|
|
1944
|
+
nameCounts.set(baseName, count + 1);
|
|
1945
|
+
const finalName = count === 0 ? baseName : `${baseName}.${count}`;
|
|
1946
|
+
const destination = path5.join(filesRoot, finalName);
|
|
1947
|
+
await copyFile(absoluteSource, destination);
|
|
1948
|
+
const resolvedDestination = path5.resolve(destination);
|
|
1949
|
+
mirrored.push(resolvedDestination);
|
|
1950
|
+
if (guidelineOriginals.has(absoluteSource)) {
|
|
1951
|
+
guidelineMirrors.add(resolvedDestination);
|
|
1952
|
+
}
|
|
1953
|
+
}
|
|
1954
|
+
return {
|
|
1955
|
+
mirroredInputFiles: mirrored,
|
|
1956
|
+
guidelineMirrors
|
|
1957
|
+
};
|
|
1958
|
+
}
|
|
1959
|
+
async createWorkspace() {
|
|
1960
|
+
return await mkdtemp(path5.join(tmpdir(), WORKSPACE_PREFIX));
|
|
1961
|
+
}
|
|
1962
|
+
async cleanupWorkspace(workspaceRoot) {
|
|
1963
|
+
try {
|
|
1964
|
+
await rm(workspaceRoot, { recursive: true, force: true });
|
|
1965
|
+
} catch {
|
|
1966
|
+
}
|
|
1967
|
+
}
|
|
1968
|
+
};
|
|
1969
|
+
async function locateExecutable(candidate) {
|
|
1970
|
+
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
1971
|
+
if (includesPathSeparator) {
|
|
1972
|
+
const resolved = path5.isAbsolute(candidate) ? candidate : path5.resolve(candidate);
|
|
1973
|
+
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
1974
|
+
await access2(executablePath, constants2.F_OK);
|
|
1975
|
+
return executablePath;
|
|
1976
|
+
}
|
|
1977
|
+
const locator = process.platform === "win32" ? "where" : "which";
|
|
1978
|
+
try {
|
|
1979
|
+
const { stdout } = await execAsync2(`${locator} ${candidate}`);
|
|
1980
|
+
const lines = stdout.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
|
|
1981
|
+
const preferred = selectExecutableCandidate(lines);
|
|
1982
|
+
if (preferred) {
|
|
1983
|
+
const executablePath = await ensureWindowsExecutableVariant(preferred);
|
|
1984
|
+
await access2(executablePath, constants2.F_OK);
|
|
1985
|
+
return executablePath;
|
|
1986
|
+
}
|
|
1987
|
+
} catch {
|
|
1988
|
+
}
|
|
1989
|
+
throw new Error(`Codex executable '${candidate}' was not found on PATH`);
|
|
1990
|
+
}
|
|
1991
|
+
function selectExecutableCandidate(candidates) {
|
|
1992
|
+
if (candidates.length === 0) {
|
|
1993
|
+
return void 0;
|
|
1994
|
+
}
|
|
1995
|
+
if (process.platform !== "win32") {
|
|
1996
|
+
return candidates[0];
|
|
1997
|
+
}
|
|
1998
|
+
const extensions = getWindowsExecutableExtensions();
|
|
1999
|
+
for (const ext of extensions) {
|
|
2000
|
+
const match = candidates.find((candidate) => candidate.toLowerCase().endsWith(ext));
|
|
2001
|
+
if (match) {
|
|
2002
|
+
return match;
|
|
2003
|
+
}
|
|
2004
|
+
}
|
|
2005
|
+
return candidates[0];
|
|
2006
|
+
}
|
|
2007
|
+
async function ensureWindowsExecutableVariant(candidate) {
|
|
2008
|
+
if (process.platform !== "win32") {
|
|
2009
|
+
return candidate;
|
|
2010
|
+
}
|
|
2011
|
+
if (hasExecutableExtension(candidate)) {
|
|
2012
|
+
return candidate;
|
|
2013
|
+
}
|
|
2014
|
+
const extensions = getWindowsExecutableExtensions();
|
|
2015
|
+
for (const ext of extensions) {
|
|
2016
|
+
const withExtension = `${candidate}${ext}`;
|
|
2017
|
+
try {
|
|
2018
|
+
await access2(withExtension, constants2.F_OK);
|
|
2019
|
+
return withExtension;
|
|
2020
|
+
} catch {
|
|
2021
|
+
}
|
|
2022
|
+
}
|
|
2023
|
+
return candidate;
|
|
2024
|
+
}
|
|
2025
|
+
function hasExecutableExtension(candidate) {
|
|
2026
|
+
const lower = candidate.toLowerCase();
|
|
2027
|
+
return getWindowsExecutableExtensions().some((ext) => lower.endsWith(ext));
|
|
2028
|
+
}
|
|
2029
|
+
var DEFAULT_WINDOWS_EXTENSIONS = [".com", ".exe", ".bat", ".cmd", ".ps1"];
|
|
2030
|
+
function getWindowsExecutableExtensions() {
|
|
2031
|
+
if (process.platform !== "win32") {
|
|
2032
|
+
return [];
|
|
2033
|
+
}
|
|
2034
|
+
const fromEnv = process.env.PATHEXT?.split(";").map((ext) => ext.trim().toLowerCase()).filter((ext) => ext.length > 0);
|
|
2035
|
+
return fromEnv && fromEnv.length > 0 ? fromEnv : DEFAULT_WINDOWS_EXTENSIONS;
|
|
2036
|
+
}
|
|
2037
|
+
function parseCodexJson(output) {
|
|
2038
|
+
const trimmed = output.trim();
|
|
2039
|
+
if (trimmed.length === 0) {
|
|
2040
|
+
throw new Error("Codex CLI produced no output in --json mode");
|
|
2041
|
+
}
|
|
2042
|
+
try {
|
|
2043
|
+
return JSON.parse(trimmed);
|
|
2044
|
+
} catch {
|
|
2045
|
+
const lineObjects = parseJsonLines(trimmed);
|
|
2046
|
+
if (lineObjects) {
|
|
2047
|
+
return lineObjects;
|
|
2048
|
+
}
|
|
2049
|
+
const lastBrace = trimmed.lastIndexOf("{");
|
|
2050
|
+
if (lastBrace >= 0) {
|
|
2051
|
+
const candidate = trimmed.slice(lastBrace);
|
|
2052
|
+
try {
|
|
2053
|
+
return JSON.parse(candidate);
|
|
2054
|
+
} catch {
|
|
2055
|
+
}
|
|
2056
|
+
}
|
|
2057
|
+
const preview = trimmed.slice(0, 200);
|
|
2058
|
+
throw new Error(`Codex CLI emitted invalid JSON: ${preview}${trimmed.length > 200 ? "\u2026" : ""}`);
|
|
2059
|
+
}
|
|
2060
|
+
}
|
|
2061
|
+
function extractAssistantText(parsed) {
|
|
2062
|
+
if (Array.isArray(parsed)) {
|
|
2063
|
+
const text = extractFromEventStream(parsed);
|
|
2064
|
+
if (text) {
|
|
2065
|
+
return text;
|
|
2066
|
+
}
|
|
2067
|
+
}
|
|
2068
|
+
if (!parsed || typeof parsed !== "object") {
|
|
2069
|
+
throw new Error("Codex CLI JSON response did not include an assistant message");
|
|
2070
|
+
}
|
|
2071
|
+
const record = parsed;
|
|
2072
|
+
const eventText = extractFromEvent(record);
|
|
2073
|
+
if (eventText) {
|
|
2074
|
+
return eventText;
|
|
2075
|
+
}
|
|
2076
|
+
const messages = Array.isArray(record.messages) ? record.messages : void 0;
|
|
2077
|
+
if (messages) {
|
|
2078
|
+
for (let index = messages.length - 1; index >= 0; index -= 1) {
|
|
2079
|
+
const entry = messages[index];
|
|
2080
|
+
if (!entry || typeof entry !== "object") {
|
|
2081
|
+
continue;
|
|
2082
|
+
}
|
|
2083
|
+
const role = entry.role;
|
|
2084
|
+
if (role !== "assistant") {
|
|
2085
|
+
continue;
|
|
2086
|
+
}
|
|
2087
|
+
const content = entry.content;
|
|
2088
|
+
const flattened = flattenContent(content);
|
|
2089
|
+
if (flattened) {
|
|
2090
|
+
return flattened;
|
|
2091
|
+
}
|
|
2092
|
+
}
|
|
2093
|
+
}
|
|
2094
|
+
const response = record.response;
|
|
2095
|
+
if (response && typeof response === "object") {
|
|
2096
|
+
const content = response.content;
|
|
2097
|
+
const flattened = flattenContent(content);
|
|
2098
|
+
if (flattened) {
|
|
2099
|
+
return flattened;
|
|
2100
|
+
}
|
|
2101
|
+
}
|
|
2102
|
+
const output = record.output;
|
|
2103
|
+
const flattenedOutput = flattenContent(output);
|
|
2104
|
+
if (flattenedOutput) {
|
|
2105
|
+
return flattenedOutput;
|
|
2106
|
+
}
|
|
2107
|
+
throw new Error("Codex CLI JSON response did not include an assistant message");
|
|
2108
|
+
}
|
|
2109
|
+
function extractFromEventStream(events) {
|
|
2110
|
+
for (let index = events.length - 1; index >= 0; index -= 1) {
|
|
2111
|
+
const candidate = events[index];
|
|
2112
|
+
const text = extractFromEvent(candidate);
|
|
2113
|
+
if (text) {
|
|
2114
|
+
return text;
|
|
2115
|
+
}
|
|
2116
|
+
}
|
|
2117
|
+
return void 0;
|
|
2118
|
+
}
|
|
2119
|
+
function extractFromEvent(event) {
|
|
2120
|
+
if (!event || typeof event !== "object") {
|
|
2121
|
+
return void 0;
|
|
2122
|
+
}
|
|
2123
|
+
const record = event;
|
|
2124
|
+
const type = typeof record.type === "string" ? record.type : void 0;
|
|
2125
|
+
if (type === JSONL_TYPE_ITEM_COMPLETED) {
|
|
2126
|
+
const item = record.item;
|
|
2127
|
+
const text = extractFromItem(item);
|
|
2128
|
+
if (text) {
|
|
2129
|
+
return text;
|
|
2130
|
+
}
|
|
2131
|
+
}
|
|
2132
|
+
const output = record.output ?? record.content;
|
|
2133
|
+
const flattened = flattenContent(output);
|
|
2134
|
+
if (flattened) {
|
|
2135
|
+
return flattened;
|
|
2136
|
+
}
|
|
2137
|
+
return void 0;
|
|
2138
|
+
}
|
|
2139
|
+
function extractFromItem(item) {
|
|
2140
|
+
if (!item || typeof item !== "object") {
|
|
2141
|
+
return void 0;
|
|
2142
|
+
}
|
|
2143
|
+
const record = item;
|
|
2144
|
+
const itemType = typeof record.type === "string" ? record.type : void 0;
|
|
2145
|
+
if (itemType === "agent_message" || itemType === "response" || itemType === "output") {
|
|
2146
|
+
const text = flattenContent(record.text ?? record.content ?? record.output);
|
|
2147
|
+
if (text) {
|
|
2148
|
+
return text;
|
|
2149
|
+
}
|
|
2150
|
+
}
|
|
2151
|
+
return void 0;
|
|
2152
|
+
}
|
|
2153
|
+
function flattenContent(value) {
|
|
2154
|
+
if (typeof value === "string") {
|
|
2155
|
+
return value;
|
|
2156
|
+
}
|
|
2157
|
+
if (Array.isArray(value)) {
|
|
2158
|
+
const parts = value.map((segment) => {
|
|
2159
|
+
if (typeof segment === "string") {
|
|
2160
|
+
return segment;
|
|
2161
|
+
}
|
|
2162
|
+
if (segment && typeof segment === "object" && "text" in segment) {
|
|
2163
|
+
const text = segment.text;
|
|
2164
|
+
return typeof text === "string" ? text : void 0;
|
|
2165
|
+
}
|
|
2166
|
+
return void 0;
|
|
2167
|
+
}).filter((part) => typeof part === "string" && part.length > 0);
|
|
2168
|
+
return parts.length > 0 ? parts.join(" \n") : void 0;
|
|
2169
|
+
}
|
|
2170
|
+
if (value && typeof value === "object" && "text" in value) {
|
|
2171
|
+
const text = value.text;
|
|
2172
|
+
return typeof text === "string" ? text : void 0;
|
|
2173
|
+
}
|
|
2174
|
+
return void 0;
|
|
2175
|
+
}
|
|
2176
|
+
function parseJsonLines(output) {
|
|
2177
|
+
const lines = output.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
|
|
2178
|
+
if (lines.length <= 1) {
|
|
2179
|
+
return void 0;
|
|
2180
|
+
}
|
|
2181
|
+
const parsed = [];
|
|
2182
|
+
for (const line of lines) {
|
|
2183
|
+
try {
|
|
2184
|
+
parsed.push(JSON.parse(line));
|
|
2185
|
+
} catch {
|
|
2186
|
+
return void 0;
|
|
2187
|
+
}
|
|
2188
|
+
}
|
|
2189
|
+
return parsed;
|
|
2190
|
+
}
|
|
2191
|
+
function pickDetail(stderr, stdout) {
|
|
2192
|
+
const errorText = stderr.trim();
|
|
2193
|
+
if (errorText.length > 0) {
|
|
2194
|
+
return errorText;
|
|
2195
|
+
}
|
|
2196
|
+
const stdoutText = stdout.trim();
|
|
2197
|
+
return stdoutText.length > 0 ? stdoutText : void 0;
|
|
2198
|
+
}
|
|
2199
|
+
function formatTimeoutSuffix2(timeoutMs) {
|
|
2200
|
+
if (!timeoutMs || timeoutMs <= 0) {
|
|
2201
|
+
return "";
|
|
2202
|
+
}
|
|
2203
|
+
const seconds = Math.ceil(timeoutMs / 1e3);
|
|
2204
|
+
return ` after ${seconds}s`;
|
|
2205
|
+
}
|
|
2206
|
+
async function defaultCodexRunner(options) {
|
|
2207
|
+
return await new Promise((resolve, reject) => {
|
|
2208
|
+
const child = spawn(options.executable, options.args, {
|
|
2209
|
+
cwd: options.cwd,
|
|
2210
|
+
env: options.env,
|
|
2211
|
+
stdio: ["pipe", "pipe", "pipe"],
|
|
2212
|
+
shell: shouldShellExecute(options.executable)
|
|
2213
|
+
});
|
|
2214
|
+
let stdout = "";
|
|
2215
|
+
let stderr = "";
|
|
2216
|
+
let timedOut = false;
|
|
2217
|
+
const onAbort = () => {
|
|
2218
|
+
child.kill("SIGTERM");
|
|
2219
|
+
};
|
|
2220
|
+
if (options.signal) {
|
|
2221
|
+
if (options.signal.aborted) {
|
|
2222
|
+
onAbort();
|
|
2223
|
+
} else {
|
|
2224
|
+
options.signal.addEventListener("abort", onAbort, { once: true });
|
|
2225
|
+
}
|
|
2226
|
+
}
|
|
2227
|
+
let timeoutHandle;
|
|
2228
|
+
if (options.timeoutMs && options.timeoutMs > 0) {
|
|
2229
|
+
timeoutHandle = setTimeout(() => {
|
|
2230
|
+
timedOut = true;
|
|
2231
|
+
child.kill("SIGTERM");
|
|
2232
|
+
}, options.timeoutMs);
|
|
2233
|
+
timeoutHandle.unref?.();
|
|
2234
|
+
}
|
|
2235
|
+
child.stdout.setEncoding("utf8");
|
|
2236
|
+
child.stdout.on("data", (chunk) => {
|
|
2237
|
+
stdout += chunk;
|
|
2238
|
+
});
|
|
2239
|
+
child.stderr.setEncoding("utf8");
|
|
2240
|
+
child.stderr.on("data", (chunk) => {
|
|
2241
|
+
stderr += chunk;
|
|
2242
|
+
});
|
|
2243
|
+
child.stdin.end(options.prompt);
|
|
2244
|
+
const cleanup = () => {
|
|
2245
|
+
if (timeoutHandle) {
|
|
2246
|
+
clearTimeout(timeoutHandle);
|
|
1101
2247
|
}
|
|
1102
|
-
if (
|
|
1103
|
-
|
|
2248
|
+
if (options.signal) {
|
|
2249
|
+
options.signal.removeEventListener("abort", onAbort);
|
|
1104
2250
|
}
|
|
1105
|
-
console.log(`
|
|
1106
|
-
total unlocked subagents available: ${result.created.length + result.skippedExisting.length}`);
|
|
1107
|
-
}
|
|
1108
|
-
return {
|
|
1109
|
-
provisioned: true,
|
|
1110
|
-
message: `Provisioned ${count} subagent(s): ${result.created.length} created, ${result.skippedExisting.length} reused`
|
|
1111
|
-
};
|
|
1112
|
-
} catch (error) {
|
|
1113
|
-
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
1114
|
-
if (verbose) {
|
|
1115
|
-
console.warn(`Provisioning failed (continuing anyway): ${errorMessage}`);
|
|
1116
|
-
}
|
|
1117
|
-
return {
|
|
1118
|
-
provisioned: false,
|
|
1119
|
-
message: `Provisioning failed: ${errorMessage}`
|
|
1120
2251
|
};
|
|
2252
|
+
child.on("error", (error) => {
|
|
2253
|
+
cleanup();
|
|
2254
|
+
reject(error);
|
|
2255
|
+
});
|
|
2256
|
+
child.on("close", (code) => {
|
|
2257
|
+
cleanup();
|
|
2258
|
+
resolve({
|
|
2259
|
+
stdout,
|
|
2260
|
+
stderr,
|
|
2261
|
+
exitCode: typeof code === "number" ? code : -1,
|
|
2262
|
+
timedOut
|
|
2263
|
+
});
|
|
2264
|
+
});
|
|
2265
|
+
});
|
|
2266
|
+
}
|
|
2267
|
+
function shouldShellExecute(executable) {
|
|
2268
|
+
if (process.platform !== "win32") {
|
|
2269
|
+
return false;
|
|
1121
2270
|
}
|
|
2271
|
+
const lower = executable.toLowerCase();
|
|
2272
|
+
return lower.endsWith(".cmd") || lower.endsWith(".bat") || lower.endsWith(".ps1");
|
|
1122
2273
|
}
|
|
1123
2274
|
|
|
1124
2275
|
// src/evaluation/providers/targets-file.ts
|
|
1125
|
-
import { constants as
|
|
1126
|
-
import { access as
|
|
1127
|
-
import
|
|
2276
|
+
import { constants as constants3 } from "node:fs";
|
|
2277
|
+
import { access as access3, readFile as readFile3 } from "node:fs/promises";
|
|
2278
|
+
import path6 from "node:path";
|
|
1128
2279
|
import { parse as parse2 } from "yaml";
|
|
1129
2280
|
function isRecord(value) {
|
|
1130
2281
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -1180,14 +2331,14 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
1180
2331
|
}
|
|
1181
2332
|
async function fileExists3(filePath) {
|
|
1182
2333
|
try {
|
|
1183
|
-
await
|
|
2334
|
+
await access3(filePath, constants3.F_OK);
|
|
1184
2335
|
return true;
|
|
1185
2336
|
} catch {
|
|
1186
2337
|
return false;
|
|
1187
2338
|
}
|
|
1188
2339
|
}
|
|
1189
2340
|
async function readTargetDefinitions(filePath) {
|
|
1190
|
-
const absolutePath =
|
|
2341
|
+
const absolutePath = path6.resolve(filePath);
|
|
1191
2342
|
if (!await fileExists3(absolutePath)) {
|
|
1192
2343
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
1193
2344
|
}
|
|
@@ -1214,6 +2365,10 @@ function createProvider(target) {
|
|
|
1214
2365
|
return new AnthropicProvider(target.name, target.config);
|
|
1215
2366
|
case "gemini":
|
|
1216
2367
|
return new GeminiProvider(target.name, target.config);
|
|
2368
|
+
case "cli":
|
|
2369
|
+
return new CliProvider(target.name, target.config);
|
|
2370
|
+
case "codex":
|
|
2371
|
+
return new CodexProvider(target.name, target.config);
|
|
1217
2372
|
case "mock":
|
|
1218
2373
|
return new MockProvider(target.name, target.config);
|
|
1219
2374
|
case "vscode":
|
|
@@ -1230,230 +2385,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
1230
2385
|
return createProvider(resolved);
|
|
1231
2386
|
}
|
|
1232
2387
|
|
|
1233
|
-
// src/evaluation/
|
|
1234
|
-
var KEY_TERM_MATCH_THRESHOLD = 0.5;
|
|
1235
|
-
var ACTION_WORDS = /* @__PURE__ */ new Set([
|
|
1236
|
-
"use",
|
|
1237
|
-
"avoid",
|
|
1238
|
-
"prefer",
|
|
1239
|
-
"replace",
|
|
1240
|
-
"consider",
|
|
1241
|
-
"ensure",
|
|
1242
|
-
"remove",
|
|
1243
|
-
"add"
|
|
1244
|
-
]);
|
|
1245
|
-
var STOP_WORDS = /* @__PURE__ */ new Set([
|
|
1246
|
-
"the",
|
|
1247
|
-
"a",
|
|
1248
|
-
"an",
|
|
1249
|
-
"and",
|
|
1250
|
-
"or",
|
|
1251
|
-
"but",
|
|
1252
|
-
"in",
|
|
1253
|
-
"on",
|
|
1254
|
-
"at",
|
|
1255
|
-
"to",
|
|
1256
|
-
"for",
|
|
1257
|
-
"of",
|
|
1258
|
-
"with",
|
|
1259
|
-
"by",
|
|
1260
|
-
"is",
|
|
1261
|
-
"are",
|
|
1262
|
-
"was",
|
|
1263
|
-
"were",
|
|
1264
|
-
"be",
|
|
1265
|
-
"been",
|
|
1266
|
-
"being",
|
|
1267
|
-
"have",
|
|
1268
|
-
"has",
|
|
1269
|
-
"had",
|
|
1270
|
-
"do",
|
|
1271
|
-
"does",
|
|
1272
|
-
"did",
|
|
1273
|
-
"will",
|
|
1274
|
-
"would",
|
|
1275
|
-
"could",
|
|
1276
|
-
"should"
|
|
1277
|
-
]);
|
|
1278
|
-
var ERROR_PREFIXES = [
|
|
1279
|
-
"error:",
|
|
1280
|
-
"err:",
|
|
1281
|
-
"vs code command failed",
|
|
1282
|
-
"exception",
|
|
1283
|
-
"traceback",
|
|
1284
|
-
"no response file was generated",
|
|
1285
|
-
"timed out",
|
|
1286
|
-
"cli not found"
|
|
1287
|
-
];
|
|
1288
|
-
function extractAspects(expectedResponse) {
|
|
1289
|
-
const lines = expectedResponse.split(/\r?\n/).map((line) => line.trim());
|
|
1290
|
-
const aspects = [];
|
|
1291
|
-
for (const line of lines) {
|
|
1292
|
-
if (line.length === 0) {
|
|
1293
|
-
continue;
|
|
1294
|
-
}
|
|
1295
|
-
const bulletMatch = /^([-*•]|[0-9]+\.)\s*(.+)$/.exec(line);
|
|
1296
|
-
if (bulletMatch) {
|
|
1297
|
-
const normalized = normalizeAspect(bulletMatch[2]);
|
|
1298
|
-
if (normalized.length > 0) {
|
|
1299
|
-
aspects.push(normalized);
|
|
1300
|
-
}
|
|
1301
|
-
continue;
|
|
1302
|
-
}
|
|
1303
|
-
const lowered = line.toLowerCase();
|
|
1304
|
-
if (Array.from(ACTION_WORDS).some((word) => lowered.startsWith(word))) {
|
|
1305
|
-
const normalized = normalizeAspect(line);
|
|
1306
|
-
if (normalized.length > 0) {
|
|
1307
|
-
aspects.push(normalized);
|
|
1308
|
-
}
|
|
1309
|
-
}
|
|
1310
|
-
}
|
|
1311
|
-
return aspects;
|
|
1312
|
-
}
|
|
1313
|
-
function calculateHits(candidateResponse, expectedAspects) {
|
|
1314
|
-
const { normalizedText, words } = normalizeCandidate(candidateResponse);
|
|
1315
|
-
const hits = [];
|
|
1316
|
-
for (const aspect of expectedAspects) {
|
|
1317
|
-
if (matchesAspect(aspect, normalizedText, words)) {
|
|
1318
|
-
hits.push(aspect);
|
|
1319
|
-
}
|
|
1320
|
-
}
|
|
1321
|
-
return hits;
|
|
1322
|
-
}
|
|
1323
|
-
function calculateMisses(candidateResponse, expectedAspects, resolvedHits) {
|
|
1324
|
-
const hits = new Set(resolvedHits ?? calculateHits(candidateResponse, expectedAspects));
|
|
1325
|
-
return expectedAspects.filter((aspect) => !hits.has(aspect));
|
|
1326
|
-
}
|
|
1327
|
-
function scoreCandidateResponse(candidateResponse, expectedAspects) {
|
|
1328
|
-
if (expectedAspects.length === 0) {
|
|
1329
|
-
if (isErrorLike(candidateResponse)) {
|
|
1330
|
-
return {
|
|
1331
|
-
score: 0,
|
|
1332
|
-
hits: [],
|
|
1333
|
-
misses: ["Model produced an error instead of an answer."],
|
|
1334
|
-
hitCount: 0,
|
|
1335
|
-
totalAspects: 0,
|
|
1336
|
-
rawAspects: []
|
|
1337
|
-
};
|
|
1338
|
-
}
|
|
1339
|
-
return {
|
|
1340
|
-
score: 1,
|
|
1341
|
-
hits: [],
|
|
1342
|
-
misses: [],
|
|
1343
|
-
hitCount: 0,
|
|
1344
|
-
totalAspects: 0,
|
|
1345
|
-
rawAspects: []
|
|
1346
|
-
};
|
|
1347
|
-
}
|
|
1348
|
-
const hits = calculateHits(candidateResponse, expectedAspects);
|
|
1349
|
-
const misses = expectedAspects.filter((aspect) => !hits.includes(aspect));
|
|
1350
|
-
const score = expectedAspects.length > 0 ? hits.length / expectedAspects.length : 0;
|
|
1351
|
-
return {
|
|
1352
|
-
score,
|
|
1353
|
-
hits,
|
|
1354
|
-
misses,
|
|
1355
|
-
hitCount: hits.length,
|
|
1356
|
-
totalAspects: expectedAspects.length,
|
|
1357
|
-
rawAspects: expectedAspects
|
|
1358
|
-
};
|
|
1359
|
-
}
|
|
1360
|
-
function isErrorLike(text) {
|
|
1361
|
-
if (!text) {
|
|
1362
|
-
return false;
|
|
1363
|
-
}
|
|
1364
|
-
const lowered = text.trim().toLowerCase();
|
|
1365
|
-
return ERROR_PREFIXES.some((prefix) => lowered.startsWith(prefix));
|
|
1366
|
-
}
|
|
1367
|
-
function normalizeAspect(aspect) {
|
|
1368
|
-
const sanitized = aspect.toLowerCase().replace(/[^\w\s]/g, " ").replace(/\s+/g, " ").trim();
|
|
1369
|
-
return sanitized;
|
|
1370
|
-
}
|
|
1371
|
-
function normalizeCandidate(candidate) {
|
|
1372
|
-
const lowered = candidate.toLowerCase();
|
|
1373
|
-
const normalizedText = lowered.replace(/[^\w\s]/g, " ");
|
|
1374
|
-
const words = new Set(normalizedText.split(/\s+/).filter((word) => word.length > 0));
|
|
1375
|
-
return { normalizedText, words };
|
|
1376
|
-
}
|
|
1377
|
-
function matchesAspect(aspect, candidateNormalized, candidateWords) {
|
|
1378
|
-
const keyTerms = extractKeyTerms(aspect);
|
|
1379
|
-
if (keyTerms.length === 0) {
|
|
1380
|
-
return false;
|
|
1381
|
-
}
|
|
1382
|
-
const matches = keyTerms.filter((term) => candidateWords.has(term)).length;
|
|
1383
|
-
const ratio = matches / keyTerms.length;
|
|
1384
|
-
if (ratio >= KEY_TERM_MATCH_THRESHOLD) {
|
|
1385
|
-
return true;
|
|
1386
|
-
}
|
|
1387
|
-
const aspectWords = aspect.split(" ");
|
|
1388
|
-
if (aspectWords.length >= 2) {
|
|
1389
|
-
for (let index = 0; index < aspectWords.length - 1; index += 1) {
|
|
1390
|
-
const phrase = `${aspectWords[index]} ${aspectWords[index + 1]}`;
|
|
1391
|
-
if (candidateNormalized.includes(phrase)) {
|
|
1392
|
-
return true;
|
|
1393
|
-
}
|
|
1394
|
-
}
|
|
1395
|
-
}
|
|
1396
|
-
return false;
|
|
1397
|
-
}
|
|
1398
|
-
function extractKeyTerms(aspect, maxTerms = 5) {
|
|
1399
|
-
const terms = [];
|
|
1400
|
-
const words = aspect.split(" ");
|
|
1401
|
-
for (const word of words) {
|
|
1402
|
-
if (word.length <= 2) {
|
|
1403
|
-
continue;
|
|
1404
|
-
}
|
|
1405
|
-
if (STOP_WORDS.has(word)) {
|
|
1406
|
-
continue;
|
|
1407
|
-
}
|
|
1408
|
-
terms.push(word);
|
|
1409
|
-
if (terms.length >= maxTerms) {
|
|
1410
|
-
break;
|
|
1411
|
-
}
|
|
1412
|
-
}
|
|
1413
|
-
return terms;
|
|
1414
|
-
}
|
|
1415
|
-
|
|
1416
|
-
// src/evaluation/grading.ts
|
|
2388
|
+
// src/evaluation/evaluators.ts
|
|
1417
2389
|
import { randomUUID } from "node:crypto";
|
|
1418
|
-
var
|
|
1419
|
-
kind = "heuristic";
|
|
1420
|
-
grade(context) {
|
|
1421
|
-
const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
|
|
1422
|
-
const result = scoreCandidateResponse(context.candidate, expectedAspects);
|
|
1423
|
-
const misses = [...result.misses];
|
|
1424
|
-
if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
|
|
1425
|
-
const firstLine = context.candidate.split(/\r?\n/)[0]?.trim();
|
|
1426
|
-
if (firstLine && !misses.includes(firstLine)) {
|
|
1427
|
-
misses.unshift(firstLine);
|
|
1428
|
-
}
|
|
1429
|
-
}
|
|
1430
|
-
return {
|
|
1431
|
-
score: result.score,
|
|
1432
|
-
hits: result.hits,
|
|
1433
|
-
misses,
|
|
1434
|
-
expectedAspectCount: result.totalAspects,
|
|
1435
|
-
rawAspects: result.rawAspects
|
|
1436
|
-
};
|
|
1437
|
-
}
|
|
1438
|
-
};
|
|
1439
|
-
var QualityGrader = class {
|
|
2390
|
+
var LlmJudgeEvaluator = class {
|
|
1440
2391
|
kind = "llm_judge";
|
|
1441
2392
|
resolveJudgeProvider;
|
|
1442
2393
|
maxOutputTokens;
|
|
1443
2394
|
temperature;
|
|
2395
|
+
customPrompt;
|
|
1444
2396
|
constructor(options) {
|
|
1445
2397
|
this.resolveJudgeProvider = options.resolveJudgeProvider;
|
|
1446
2398
|
this.maxOutputTokens = options.maxOutputTokens;
|
|
1447
2399
|
this.temperature = options.temperature;
|
|
2400
|
+
this.customPrompt = options.customPrompt;
|
|
1448
2401
|
}
|
|
1449
|
-
async
|
|
2402
|
+
async evaluate(context) {
|
|
1450
2403
|
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
1451
2404
|
if (!judgeProvider) {
|
|
1452
2405
|
throw new Error("No judge provider available for LLM grading");
|
|
1453
2406
|
}
|
|
1454
2407
|
const prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
2408
|
+
const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
|
|
1455
2409
|
const metadata = {
|
|
1456
|
-
systemPrompt:
|
|
2410
|
+
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2411
|
+
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
1457
2412
|
};
|
|
1458
2413
|
const response = await judgeProvider.invoke({
|
|
1459
2414
|
prompt,
|
|
@@ -1468,12 +2423,13 @@ var QualityGrader = class {
|
|
|
1468
2423
|
const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
1469
2424
|
const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
1470
2425
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
1471
|
-
const
|
|
2426
|
+
const evaluatorRawRequest = {
|
|
1472
2427
|
id: randomUUID(),
|
|
1473
2428
|
provider: judgeProvider.id,
|
|
1474
2429
|
prompt,
|
|
1475
|
-
|
|
1476
|
-
|
|
2430
|
+
target: context.target.name,
|
|
2431
|
+
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2432
|
+
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
1477
2433
|
};
|
|
1478
2434
|
return {
|
|
1479
2435
|
score,
|
|
@@ -1481,7 +2437,7 @@ var QualityGrader = class {
|
|
|
1481
2437
|
misses,
|
|
1482
2438
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
1483
2439
|
reasoning,
|
|
1484
|
-
|
|
2440
|
+
evaluatorRawRequest
|
|
1485
2441
|
};
|
|
1486
2442
|
}
|
|
1487
2443
|
};
|
|
@@ -1599,11 +2555,117 @@ function extractJsonBlob(text) {
|
|
|
1599
2555
|
function isNonEmptyString(value) {
|
|
1600
2556
|
return typeof value === "string" && value.trim().length > 0;
|
|
1601
2557
|
}
|
|
2558
|
+
var CodeEvaluator = class {
|
|
2559
|
+
kind = "code";
|
|
2560
|
+
script;
|
|
2561
|
+
cwd;
|
|
2562
|
+
agentTimeoutMs;
|
|
2563
|
+
constructor(options) {
|
|
2564
|
+
this.script = options.script;
|
|
2565
|
+
this.cwd = options.cwd;
|
|
2566
|
+
this.agentTimeoutMs = options.agentTimeoutMs;
|
|
2567
|
+
}
|
|
2568
|
+
async evaluate(context) {
|
|
2569
|
+
const inputPayload = JSON.stringify(
|
|
2570
|
+
{
|
|
2571
|
+
task: context.evalCase.task,
|
|
2572
|
+
outcome: context.evalCase.outcome,
|
|
2573
|
+
expected: context.evalCase.expected_assistant_raw,
|
|
2574
|
+
output: context.candidate,
|
|
2575
|
+
system_message: context.promptInputs.systemMessage ?? "",
|
|
2576
|
+
guideline_paths: context.evalCase.guideline_paths,
|
|
2577
|
+
attachments: context.evalCase.file_paths,
|
|
2578
|
+
user_segments: context.evalCase.user_segments
|
|
2579
|
+
},
|
|
2580
|
+
null,
|
|
2581
|
+
2
|
|
2582
|
+
);
|
|
2583
|
+
try {
|
|
2584
|
+
const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
|
|
2585
|
+
const parsed = parseJsonSafe(stdout);
|
|
2586
|
+
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
2587
|
+
const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
|
|
2588
|
+
const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
|
|
2589
|
+
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
2590
|
+
return {
|
|
2591
|
+
score,
|
|
2592
|
+
hits,
|
|
2593
|
+
misses,
|
|
2594
|
+
expectedAspectCount: hits.length + misses.length || 1,
|
|
2595
|
+
reasoning,
|
|
2596
|
+
evaluatorRawRequest: {
|
|
2597
|
+
script: this.script,
|
|
2598
|
+
...this.cwd ? { cwd: this.cwd } : {}
|
|
2599
|
+
}
|
|
2600
|
+
};
|
|
2601
|
+
} catch (error) {
|
|
2602
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
2603
|
+
return {
|
|
2604
|
+
score: 0,
|
|
2605
|
+
hits: [],
|
|
2606
|
+
misses: [`Code evaluator failed: ${message}`],
|
|
2607
|
+
expectedAspectCount: 1,
|
|
2608
|
+
reasoning: message,
|
|
2609
|
+
evaluatorRawRequest: {
|
|
2610
|
+
script: this.script,
|
|
2611
|
+
...this.cwd ? { cwd: this.cwd } : {},
|
|
2612
|
+
error: message
|
|
2613
|
+
}
|
|
2614
|
+
};
|
|
2615
|
+
}
|
|
2616
|
+
}
|
|
2617
|
+
};
|
|
2618
|
+
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
|
|
2619
|
+
const { spawn: spawn2 } = await import("node:child_process");
|
|
2620
|
+
return await new Promise((resolve, reject) => {
|
|
2621
|
+
const child = spawn2(scriptPath, {
|
|
2622
|
+
shell: true,
|
|
2623
|
+
cwd
|
|
2624
|
+
});
|
|
2625
|
+
let stdout = "";
|
|
2626
|
+
let stderr = "";
|
|
2627
|
+
const timeout = agentTimeoutMs ? setTimeout(() => {
|
|
2628
|
+
child.kill();
|
|
2629
|
+
reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
|
|
2630
|
+
}, agentTimeoutMs) : void 0;
|
|
2631
|
+
child.stdout?.on("data", (data) => {
|
|
2632
|
+
stdout += data.toString();
|
|
2633
|
+
});
|
|
2634
|
+
child.stderr?.on("data", (data) => {
|
|
2635
|
+
stderr += data.toString();
|
|
2636
|
+
});
|
|
2637
|
+
child.on("error", (error) => {
|
|
2638
|
+
if (timeout !== void 0) {
|
|
2639
|
+
clearTimeout(timeout);
|
|
2640
|
+
}
|
|
2641
|
+
reject(error);
|
|
2642
|
+
});
|
|
2643
|
+
child.on("exit", (code) => {
|
|
2644
|
+
if (timeout !== void 0) {
|
|
2645
|
+
clearTimeout(timeout);
|
|
2646
|
+
}
|
|
2647
|
+
if (code && code !== 0 && stderr.length > 0) {
|
|
2648
|
+
reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
|
|
2649
|
+
return;
|
|
2650
|
+
}
|
|
2651
|
+
resolve(stdout.trim());
|
|
2652
|
+
});
|
|
2653
|
+
child.stdin?.write(input);
|
|
2654
|
+
child.stdin?.end();
|
|
2655
|
+
});
|
|
2656
|
+
}
|
|
2657
|
+
function parseJsonSafe(payload) {
|
|
2658
|
+
try {
|
|
2659
|
+
return JSON.parse(payload);
|
|
2660
|
+
} catch {
|
|
2661
|
+
return void 0;
|
|
2662
|
+
}
|
|
2663
|
+
}
|
|
1602
2664
|
|
|
1603
2665
|
// src/evaluation/orchestrator.ts
|
|
1604
2666
|
import { createHash, randomUUID as randomUUID2 } from "node:crypto";
|
|
1605
|
-
import { mkdir, writeFile as writeFile2 } from "node:fs/promises";
|
|
1606
|
-
import
|
|
2667
|
+
import { mkdir as mkdir2, readFile as readFile4, writeFile as writeFile2 } from "node:fs/promises";
|
|
2668
|
+
import path7 from "node:path";
|
|
1607
2669
|
|
|
1608
2670
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
1609
2671
|
var Node = class {
|
|
@@ -1750,7 +2812,7 @@ async function runEvaluation(options) {
|
|
|
1750
2812
|
targets,
|
|
1751
2813
|
env,
|
|
1752
2814
|
providerFactory,
|
|
1753
|
-
|
|
2815
|
+
evaluators,
|
|
1754
2816
|
maxRetries,
|
|
1755
2817
|
agentTimeoutMs,
|
|
1756
2818
|
promptDumpDir,
|
|
@@ -1809,8 +2871,14 @@ async function runEvaluation(options) {
|
|
|
1809
2871
|
}
|
|
1810
2872
|
return getOrCreateProvider(resolvedJudge);
|
|
1811
2873
|
};
|
|
1812
|
-
const
|
|
2874
|
+
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
1813
2875
|
const primaryProvider = getOrCreateProvider(target);
|
|
2876
|
+
const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
|
|
2877
|
+
if (target.providerBatching && !providerSupportsBatch && verbose) {
|
|
2878
|
+
console.warn(
|
|
2879
|
+
`Provider batching requested for target '${target.name}', but provider does not advertise batch support. Using per-case dispatch.`
|
|
2880
|
+
);
|
|
2881
|
+
}
|
|
1814
2882
|
if (onProgress && filteredEvalCases.length > 0) {
|
|
1815
2883
|
for (let i = 0; i < filteredEvalCases.length; i++) {
|
|
1816
2884
|
await onProgress({
|
|
@@ -1820,6 +2888,28 @@ async function runEvaluation(options) {
|
|
|
1820
2888
|
});
|
|
1821
2889
|
}
|
|
1822
2890
|
}
|
|
2891
|
+
if (providerSupportsBatch) {
|
|
2892
|
+
try {
|
|
2893
|
+
return await runBatchEvaluation({
|
|
2894
|
+
evalCases: filteredEvalCases,
|
|
2895
|
+
provider: primaryProvider,
|
|
2896
|
+
target,
|
|
2897
|
+
evaluatorRegistry,
|
|
2898
|
+
promptDumpDir,
|
|
2899
|
+
nowFn: now ?? (() => /* @__PURE__ */ new Date()),
|
|
2900
|
+
onProgress,
|
|
2901
|
+
onResult,
|
|
2902
|
+
verbose,
|
|
2903
|
+
resolveJudgeProvider,
|
|
2904
|
+
agentTimeoutMs
|
|
2905
|
+
});
|
|
2906
|
+
} catch (error) {
|
|
2907
|
+
if (verbose) {
|
|
2908
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
2909
|
+
console.warn(`Provider batch execution failed, falling back to per-case dispatch: ${message}`);
|
|
2910
|
+
}
|
|
2911
|
+
}
|
|
2912
|
+
}
|
|
1823
2913
|
const workers = options.maxConcurrency ?? target.workers ?? 1;
|
|
1824
2914
|
const limit = pLimit(workers);
|
|
1825
2915
|
let nextWorkerId = 1;
|
|
@@ -1842,7 +2932,7 @@ async function runEvaluation(options) {
|
|
|
1842
2932
|
evalCase,
|
|
1843
2933
|
provider: primaryProvider,
|
|
1844
2934
|
target,
|
|
1845
|
-
|
|
2935
|
+
evaluators: evaluatorRegistry,
|
|
1846
2936
|
maxRetries,
|
|
1847
2937
|
agentTimeoutMs,
|
|
1848
2938
|
promptDumpDir,
|
|
@@ -1903,12 +2993,118 @@ async function runEvaluation(options) {
|
|
|
1903
2993
|
}
|
|
1904
2994
|
return results;
|
|
1905
2995
|
}
|
|
2996
|
+
async function runBatchEvaluation(options) {
|
|
2997
|
+
const {
|
|
2998
|
+
evalCases,
|
|
2999
|
+
provider,
|
|
3000
|
+
target,
|
|
3001
|
+
evaluatorRegistry,
|
|
3002
|
+
promptDumpDir,
|
|
3003
|
+
nowFn,
|
|
3004
|
+
onProgress,
|
|
3005
|
+
onResult,
|
|
3006
|
+
resolveJudgeProvider,
|
|
3007
|
+
agentTimeoutMs
|
|
3008
|
+
} = options;
|
|
3009
|
+
const promptInputsList = [];
|
|
3010
|
+
for (const evalCase of evalCases) {
|
|
3011
|
+
const promptInputs = await buildPromptInputs(evalCase);
|
|
3012
|
+
if (promptDumpDir) {
|
|
3013
|
+
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
3014
|
+
}
|
|
3015
|
+
promptInputsList.push(promptInputs);
|
|
3016
|
+
}
|
|
3017
|
+
const batchRequests = evalCases.map((evalCase, index) => {
|
|
3018
|
+
const promptInputs = promptInputsList[index];
|
|
3019
|
+
return {
|
|
3020
|
+
prompt: promptInputs.request,
|
|
3021
|
+
guidelines: promptInputs.guidelines,
|
|
3022
|
+
guideline_patterns: evalCase.guideline_patterns,
|
|
3023
|
+
inputFiles: evalCase.file_paths,
|
|
3024
|
+
evalCaseId: evalCase.id,
|
|
3025
|
+
metadata: {
|
|
3026
|
+
systemPrompt: promptInputs.systemMessage ?? ""
|
|
3027
|
+
}
|
|
3028
|
+
};
|
|
3029
|
+
});
|
|
3030
|
+
const batchResponse = await provider.invokeBatch?.(batchRequests);
|
|
3031
|
+
if (!Array.isArray(batchResponse)) {
|
|
3032
|
+
throw new Error("Provider batching failed: invokeBatch did not return an array");
|
|
3033
|
+
}
|
|
3034
|
+
if (batchResponse.length !== evalCases.length) {
|
|
3035
|
+
throw new Error(
|
|
3036
|
+
`Provider batching failed: expected ${evalCases.length} responses, received ${batchResponse.length}`
|
|
3037
|
+
);
|
|
3038
|
+
}
|
|
3039
|
+
if (onProgress) {
|
|
3040
|
+
const startedAt = Date.now();
|
|
3041
|
+
for (let i = 0; i < evalCases.length; i++) {
|
|
3042
|
+
await onProgress({
|
|
3043
|
+
workerId: 1,
|
|
3044
|
+
evalId: evalCases[i].id,
|
|
3045
|
+
status: "running",
|
|
3046
|
+
startedAt
|
|
3047
|
+
});
|
|
3048
|
+
}
|
|
3049
|
+
}
|
|
3050
|
+
const results = [];
|
|
3051
|
+
for (let i = 0; i < evalCases.length; i++) {
|
|
3052
|
+
const evalCase = evalCases[i];
|
|
3053
|
+
const promptInputs = promptInputsList[i];
|
|
3054
|
+
const providerResponse = batchResponse[i];
|
|
3055
|
+
let result;
|
|
3056
|
+
try {
|
|
3057
|
+
result = await evaluateCandidate({
|
|
3058
|
+
evalCase,
|
|
3059
|
+
candidate: providerResponse.text ?? "",
|
|
3060
|
+
target,
|
|
3061
|
+
provider,
|
|
3062
|
+
evaluators: evaluatorRegistry,
|
|
3063
|
+
promptInputs,
|
|
3064
|
+
nowFn,
|
|
3065
|
+
attempt: 0,
|
|
3066
|
+
judgeProvider: await resolveJudgeProvider(target),
|
|
3067
|
+
agentTimeoutMs
|
|
3068
|
+
});
|
|
3069
|
+
} catch (error) {
|
|
3070
|
+
const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
3071
|
+
results.push(errorResult);
|
|
3072
|
+
if (onResult) {
|
|
3073
|
+
await onResult(errorResult);
|
|
3074
|
+
}
|
|
3075
|
+
if (onProgress) {
|
|
3076
|
+
await onProgress({
|
|
3077
|
+
workerId: 1,
|
|
3078
|
+
evalId: evalCase.id,
|
|
3079
|
+
status: "failed",
|
|
3080
|
+
completedAt: Date.now(),
|
|
3081
|
+
error: error instanceof Error ? error.message : String(error)
|
|
3082
|
+
});
|
|
3083
|
+
}
|
|
3084
|
+
continue;
|
|
3085
|
+
}
|
|
3086
|
+
results.push(result);
|
|
3087
|
+
if (onResult) {
|
|
3088
|
+
await onResult(result);
|
|
3089
|
+
}
|
|
3090
|
+
if (onProgress) {
|
|
3091
|
+
await onProgress({
|
|
3092
|
+
workerId: 1,
|
|
3093
|
+
evalId: evalCase.id,
|
|
3094
|
+
status: "completed",
|
|
3095
|
+
startedAt: 0,
|
|
3096
|
+
completedAt: Date.now()
|
|
3097
|
+
});
|
|
3098
|
+
}
|
|
3099
|
+
}
|
|
3100
|
+
return results;
|
|
3101
|
+
}
|
|
1906
3102
|
async function runEvalCase(options) {
|
|
1907
3103
|
const {
|
|
1908
3104
|
evalCase,
|
|
1909
3105
|
provider,
|
|
1910
3106
|
target,
|
|
1911
|
-
|
|
3107
|
+
evaluators,
|
|
1912
3108
|
now,
|
|
1913
3109
|
maxRetries,
|
|
1914
3110
|
agentTimeoutMs,
|
|
@@ -1963,27 +3159,49 @@ async function runEvalCase(options) {
|
|
|
1963
3159
|
if (cacheKey && cache && !cachedResponse) {
|
|
1964
3160
|
await cache.set(cacheKey, providerResponse);
|
|
1965
3161
|
}
|
|
1966
|
-
const graderKind = evalCase.grader ?? "heuristic";
|
|
1967
|
-
const activeGrader = graders[graderKind] ?? graders.heuristic;
|
|
1968
|
-
if (!activeGrader) {
|
|
1969
|
-
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
1970
|
-
}
|
|
1971
|
-
let grade;
|
|
1972
3162
|
try {
|
|
1973
|
-
|
|
1974
|
-
grade = await activeGrader.grade({
|
|
3163
|
+
return await evaluateCandidate({
|
|
1975
3164
|
evalCase,
|
|
1976
3165
|
candidate: providerResponse.text ?? "",
|
|
1977
3166
|
target,
|
|
1978
3167
|
provider,
|
|
1979
|
-
|
|
3168
|
+
evaluators,
|
|
1980
3169
|
promptInputs,
|
|
1981
|
-
|
|
1982
|
-
|
|
3170
|
+
nowFn,
|
|
3171
|
+
attempt,
|
|
3172
|
+
judgeProvider,
|
|
3173
|
+
agentTimeoutMs
|
|
1983
3174
|
});
|
|
1984
3175
|
} catch (error) {
|
|
1985
3176
|
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
1986
3177
|
}
|
|
3178
|
+
}
|
|
3179
|
+
async function evaluateCandidate(options) {
|
|
3180
|
+
const {
|
|
3181
|
+
evalCase,
|
|
3182
|
+
candidate,
|
|
3183
|
+
target,
|
|
3184
|
+
provider,
|
|
3185
|
+
evaluators,
|
|
3186
|
+
promptInputs,
|
|
3187
|
+
nowFn,
|
|
3188
|
+
attempt,
|
|
3189
|
+
judgeProvider,
|
|
3190
|
+
agentTimeoutMs
|
|
3191
|
+
} = options;
|
|
3192
|
+
const gradeTimestamp = nowFn();
|
|
3193
|
+
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
3194
|
+
evalCase,
|
|
3195
|
+
candidate,
|
|
3196
|
+
target,
|
|
3197
|
+
provider,
|
|
3198
|
+
evaluators,
|
|
3199
|
+
attempt,
|
|
3200
|
+
promptInputs,
|
|
3201
|
+
now: gradeTimestamp,
|
|
3202
|
+
judgeProvider,
|
|
3203
|
+
agentTimeoutMs
|
|
3204
|
+
});
|
|
1987
3205
|
const completedAt = nowFn();
|
|
1988
3206
|
const rawRequest = {
|
|
1989
3207
|
request: promptInputs.request,
|
|
@@ -1994,28 +3212,200 @@ async function runEvalCase(options) {
|
|
|
1994
3212
|
return {
|
|
1995
3213
|
eval_id: evalCase.id,
|
|
1996
3214
|
conversation_id: evalCase.conversation_id,
|
|
1997
|
-
score:
|
|
1998
|
-
hits:
|
|
1999
|
-
misses:
|
|
2000
|
-
model_answer:
|
|
2001
|
-
expected_aspect_count:
|
|
3215
|
+
score: score.score,
|
|
3216
|
+
hits: score.hits,
|
|
3217
|
+
misses: score.misses,
|
|
3218
|
+
model_answer: candidate,
|
|
3219
|
+
expected_aspect_count: score.expectedAspectCount,
|
|
2002
3220
|
target: target.name,
|
|
2003
3221
|
timestamp: completedAt.toISOString(),
|
|
2004
|
-
reasoning:
|
|
2005
|
-
raw_aspects:
|
|
3222
|
+
reasoning: score.reasoning,
|
|
3223
|
+
raw_aspects: score.rawAspects,
|
|
2006
3224
|
raw_request: rawRequest,
|
|
2007
|
-
|
|
3225
|
+
evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
3226
|
+
evaluator_results: evaluatorResults
|
|
2008
3227
|
};
|
|
2009
3228
|
}
|
|
3229
|
+
async function runEvaluatorsForCase(options) {
|
|
3230
|
+
const { evalCase, candidate, target, provider, evaluators, attempt, promptInputs, now, judgeProvider, agentTimeoutMs } = options;
|
|
3231
|
+
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
3232
|
+
return runEvaluatorList({
|
|
3233
|
+
evalCase,
|
|
3234
|
+
evaluators: evalCase.evaluators,
|
|
3235
|
+
candidate,
|
|
3236
|
+
target,
|
|
3237
|
+
provider,
|
|
3238
|
+
evaluatorRegistry: evaluators,
|
|
3239
|
+
attempt,
|
|
3240
|
+
promptInputs,
|
|
3241
|
+
now,
|
|
3242
|
+
judgeProvider,
|
|
3243
|
+
agentTimeoutMs
|
|
3244
|
+
});
|
|
3245
|
+
}
|
|
3246
|
+
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
3247
|
+
const activeEvaluator = evaluators[evaluatorKind] ?? evaluators.llm_judge;
|
|
3248
|
+
if (!activeEvaluator) {
|
|
3249
|
+
throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
|
|
3250
|
+
}
|
|
3251
|
+
const score = await activeEvaluator.evaluate({
|
|
3252
|
+
evalCase,
|
|
3253
|
+
candidate,
|
|
3254
|
+
target,
|
|
3255
|
+
provider,
|
|
3256
|
+
attempt,
|
|
3257
|
+
promptInputs,
|
|
3258
|
+
now,
|
|
3259
|
+
judgeProvider
|
|
3260
|
+
});
|
|
3261
|
+
return { score };
|
|
3262
|
+
}
|
|
3263
|
+
async function runEvaluatorList(options) {
|
|
3264
|
+
const {
|
|
3265
|
+
evalCase,
|
|
3266
|
+
evaluators,
|
|
3267
|
+
candidate,
|
|
3268
|
+
target,
|
|
3269
|
+
provider,
|
|
3270
|
+
evaluatorRegistry,
|
|
3271
|
+
attempt,
|
|
3272
|
+
promptInputs,
|
|
3273
|
+
now,
|
|
3274
|
+
judgeProvider,
|
|
3275
|
+
agentTimeoutMs
|
|
3276
|
+
} = options;
|
|
3277
|
+
const scored = [];
|
|
3278
|
+
const evaluatorResults = [];
|
|
3279
|
+
for (const evaluator of evaluators ?? []) {
|
|
3280
|
+
try {
|
|
3281
|
+
if (evaluator.type === "llm_judge") {
|
|
3282
|
+
const score2 = await runLlmJudgeEvaluator({
|
|
3283
|
+
config: evaluator,
|
|
3284
|
+
evalCase,
|
|
3285
|
+
candidate,
|
|
3286
|
+
target,
|
|
3287
|
+
provider,
|
|
3288
|
+
evaluatorRegistry,
|
|
3289
|
+
attempt,
|
|
3290
|
+
promptInputs,
|
|
3291
|
+
now,
|
|
3292
|
+
judgeProvider
|
|
3293
|
+
});
|
|
3294
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
3295
|
+
evaluatorResults.push({
|
|
3296
|
+
name: evaluator.name,
|
|
3297
|
+
type: evaluator.type,
|
|
3298
|
+
score: score2.score,
|
|
3299
|
+
hits: score2.hits,
|
|
3300
|
+
misses: score2.misses,
|
|
3301
|
+
reasoning: score2.reasoning,
|
|
3302
|
+
evaluator_raw_request: score2.evaluatorRawRequest
|
|
3303
|
+
});
|
|
3304
|
+
continue;
|
|
3305
|
+
}
|
|
3306
|
+
if (evaluator.type === "code") {
|
|
3307
|
+
const codeEvaluator = new CodeEvaluator({
|
|
3308
|
+
script: evaluator.script,
|
|
3309
|
+
cwd: evaluator.resolvedCwd ?? evaluator.cwd,
|
|
3310
|
+
agentTimeoutMs
|
|
3311
|
+
});
|
|
3312
|
+
const score2 = await codeEvaluator.evaluate({
|
|
3313
|
+
evalCase,
|
|
3314
|
+
candidate,
|
|
3315
|
+
target,
|
|
3316
|
+
provider,
|
|
3317
|
+
attempt,
|
|
3318
|
+
promptInputs,
|
|
3319
|
+
now
|
|
3320
|
+
});
|
|
3321
|
+
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
3322
|
+
evaluatorResults.push({
|
|
3323
|
+
name: evaluator.name,
|
|
3324
|
+
type: evaluator.type,
|
|
3325
|
+
score: score2.score,
|
|
3326
|
+
hits: score2.hits,
|
|
3327
|
+
misses: score2.misses,
|
|
3328
|
+
reasoning: score2.reasoning,
|
|
3329
|
+
evaluator_raw_request: score2.evaluatorRawRequest
|
|
3330
|
+
});
|
|
3331
|
+
continue;
|
|
3332
|
+
}
|
|
3333
|
+
} catch (error) {
|
|
3334
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3335
|
+
const fallbackScore = {
|
|
3336
|
+
score: 0,
|
|
3337
|
+
hits: [],
|
|
3338
|
+
misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
|
|
3339
|
+
expectedAspectCount: 1,
|
|
3340
|
+
reasoning: message
|
|
3341
|
+
};
|
|
3342
|
+
scored.push({ score: fallbackScore, name: evaluator.name ?? "unknown", type: evaluator.type ?? "unknown" });
|
|
3343
|
+
evaluatorResults.push({
|
|
3344
|
+
name: evaluator.name ?? "unknown",
|
|
3345
|
+
type: evaluator.type ?? "unknown",
|
|
3346
|
+
score: 0,
|
|
3347
|
+
hits: [],
|
|
3348
|
+
misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
|
|
3349
|
+
reasoning: message
|
|
3350
|
+
});
|
|
3351
|
+
}
|
|
3352
|
+
}
|
|
3353
|
+
const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
|
|
3354
|
+
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
3355
|
+
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
3356
|
+
const expectedAspectCount = scored.reduce((total, entry) => total + (entry.score.expectedAspectCount ?? 0), 0);
|
|
3357
|
+
const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
|
|
3358
|
+
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
|
|
3359
|
+
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
3360
|
+
const score = {
|
|
3361
|
+
score: aggregateScore,
|
|
3362
|
+
hits,
|
|
3363
|
+
misses,
|
|
3364
|
+
expectedAspectCount,
|
|
3365
|
+
reasoning,
|
|
3366
|
+
rawAspects: rawAspects.length > 0 ? rawAspects : void 0
|
|
3367
|
+
};
|
|
3368
|
+
return { score, evaluatorResults };
|
|
3369
|
+
}
|
|
3370
|
+
/**
 * Runs a single `llm_judge` evaluator entry.
 *
 * Resolves the custom prompt for `options.config` first, then hands the
 * evaluation off to `options.evaluatorRegistry.llm_judge.evaluate`.
 *
 * @param {object} options - Evaluation context. The fields read are `config`,
 *   `evalCase`, `candidate`, `target`, `provider`, `evaluatorRegistry`,
 *   `attempt`, `promptInputs`, `now` and `judgeProvider`; anything else on
 *   the object is not forwarded.
 * @returns {Promise<*>} Whatever the registry's `llm_judge.evaluate` returns.
 */
async function runLlmJudgeEvaluator(options) {
  const {
    config: judgeConfig,
    evalCase,
    candidate,
    target,
    provider,
    evaluatorRegistry: registry,
    attempt,
    promptInputs,
    now,
    judgeProvider
  } = options;

  // The prompt is resolved before the judge is invoked so the judge receives
  // it as its system prompt.
  const systemPrompt = await resolveCustomPrompt(judgeConfig);

  return registry.llm_judge.evaluate({
    evalCase,
    candidate,
    target,
    provider,
    attempt,
    promptInputs,
    now,
    judgeProvider,
    systemPrompt,
    evaluator: judgeConfig,
    judgeModel: judgeConfig.model
  });
}
|
|
3387
|
+
/**
 * Picks the prompt text for an evaluator config.
 *
 * When `config.promptPath` is truthy the file at that path is read as UTF-8
 * and its contents are returned. If the read fails, a warning is logged and
 * the inline `config.prompt` is used instead. Without a `promptPath` the
 * inline `config.prompt` is returned directly (which may be `undefined`).
 *
 * @param {{ promptPath?: string, prompt?: string }} config - Evaluator config.
 * @returns {Promise<string | undefined>} The resolved prompt text.
 */
async function resolveCustomPrompt(config) {
  const promptFile = config.promptPath;
  if (!promptFile) {
    return config.prompt;
  }

  let fileContents;
  try {
    fileContents = await readFile4(promptFile, "utf8");
  } catch (cause) {
    // Best-effort: an unreadable prompt file must not abort the evaluation.
    const reason = cause instanceof Error ? cause.message : String(cause);
    console.warn(`Could not read custom prompt at ${promptFile}: ${reason}`);
    return config.prompt;
  }
  return fileContents;
}
|
|
3398
|
+
/**
 * Tells whether a value is a string with at least one non-whitespace character.
 *
 * @param {*} value - Any value.
 * @returns {boolean} `true` only for strings that are non-empty after trimming.
 */
function isNonEmptyString2(value) {
  if (typeof value !== "string") {
    return false;
  }
  return value.trim() !== "";
}
|
|
2010
3401
|
/**
 * Narrows a list of eval cases to those with a given id.
 *
 * @param {Array<{ id: * }>} evalCases - Cases to choose from.
 * @param {*} [evalId] - Id to keep. When falsy, no filtering takes place.
 * @returns {Array} The same `evalCases` array when `evalId` is falsy,
 *   otherwise a new array holding only the cases whose `id` strictly equals
 *   `evalId`.
 */
function filterEvalCases(evalCases, evalId) {
  return evalId ? evalCases.filter(({ id }) => id === evalId) : evalCases;
}
|
|
2016
|
-
function
|
|
2017
|
-
const
|
|
2018
|
-
const llmJudge = overrides?.llm_judge ?? new QualityGrader({
|
|
3407
|
+
function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
3408
|
+
const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
|
|
2019
3409
|
resolveJudgeProvider: async (context) => {
|
|
2020
3410
|
if (context.judgeProvider) {
|
|
2021
3411
|
return context.judgeProvider;
|
|
@@ -2025,15 +3415,14 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
|
|
|
2025
3415
|
});
|
|
2026
3416
|
return {
|
|
2027
3417
|
...overrides,
|
|
2028
|
-
heuristic,
|
|
2029
3418
|
llm_judge: llmJudge
|
|
2030
3419
|
};
|
|
2031
3420
|
}
|
|
2032
3421
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
2033
3422
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
2034
3423
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
2035
|
-
const filePath =
|
|
2036
|
-
await
|
|
3424
|
+
const filePath = path7.resolve(directory, filename);
|
|
3425
|
+
await mkdir2(path7.dirname(filePath), { recursive: true });
|
|
2037
3426
|
const payload = {
|
|
2038
3427
|
eval_id: evalCase.id,
|
|
2039
3428
|
request: promptInputs.request,
|
|
@@ -2050,7 +3439,7 @@ function sanitizeFilename(value) {
|
|
|
2050
3439
|
return sanitized.length > 0 ? sanitized : randomUUID2();
|
|
2051
3440
|
}
|
|
2052
3441
|
async function invokeProvider(provider, options) {
|
|
2053
|
-
const { evalCase,
|
|
3442
|
+
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
2054
3443
|
const controller = new AbortController();
|
|
2055
3444
|
const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
|
|
2056
3445
|
if (signal) {
|
|
@@ -2061,7 +3450,7 @@ async function invokeProvider(provider, options) {
|
|
|
2061
3450
|
prompt: promptInputs.request,
|
|
2062
3451
|
guidelines: promptInputs.guidelines,
|
|
2063
3452
|
guideline_patterns: evalCase.guideline_patterns,
|
|
2064
|
-
|
|
3453
|
+
inputFiles: evalCase.file_paths,
|
|
2065
3454
|
evalCaseId: evalCase.id,
|
|
2066
3455
|
attempt,
|
|
2067
3456
|
metadata: {
|
|
@@ -2129,25 +3518,20 @@ function createAgentKernel() {
|
|
|
2129
3518
|
return { status: "stub" };
|
|
2130
3519
|
}
|
|
2131
3520
|
export {
|
|
2132
|
-
|
|
2133
|
-
|
|
2134
|
-
QualityGrader,
|
|
3521
|
+
CodeEvaluator,
|
|
3522
|
+
LlmJudgeEvaluator,
|
|
2135
3523
|
TEST_MESSAGE_ROLES,
|
|
2136
3524
|
buildDirectoryChain,
|
|
2137
3525
|
buildPromptInputs,
|
|
2138
3526
|
buildSearchRoots,
|
|
2139
|
-
calculateHits,
|
|
2140
|
-
calculateMisses,
|
|
2141
3527
|
createAgentKernel,
|
|
2142
3528
|
createProvider,
|
|
2143
3529
|
ensureVSCodeSubagents,
|
|
2144
|
-
extractAspects,
|
|
2145
3530
|
extractCodeBlocks,
|
|
2146
3531
|
fileExists,
|
|
2147
3532
|
findGitRoot,
|
|
2148
3533
|
getHitCount,
|
|
2149
|
-
|
|
2150
|
-
isGraderKind,
|
|
3534
|
+
isEvaluatorKind,
|
|
2151
3535
|
isGuidelineFile,
|
|
2152
3536
|
isJsonObject,
|
|
2153
3537
|
isJsonValue,
|
|
@@ -2160,7 +3544,6 @@ export {
|
|
|
2160
3544
|
resolveFileReference,
|
|
2161
3545
|
resolveTargetDefinition,
|
|
2162
3546
|
runEvalCase,
|
|
2163
|
-
runEvaluation
|
|
2164
|
-
scoreCandidateResponse
|
|
3547
|
+
runEvaluation
|
|
2165
3548
|
};
|
|
2166
3549
|
//# sourceMappingURL=index.js.map
|