@agentv/core 0.2.11 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5,7 +5,7 @@ import {
5
5
  fileExists,
6
6
  findGitRoot,
7
7
  resolveFileReference
8
- } from "./chunk-P4GOYWYH.js";
8
+ } from "./chunk-NL7K4CAK.js";
9
9
 
10
10
  // src/evaluation/types.ts
11
11
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -48,11 +48,10 @@ function isTestMessage(value) {
48
48
  }
49
49
  return candidate.content.every(isJsonObject);
50
50
  }
51
- var GRADER_KIND_VALUES = ["heuristic", "llm_judge"];
52
- var GRADER_KINDS = GRADER_KIND_VALUES;
53
- var GRADER_KIND_SET = new Set(GRADER_KIND_VALUES);
54
- function isGraderKind(value) {
55
- return typeof value === "string" && GRADER_KIND_SET.has(value);
51
// Evaluator kinds that an eval file may declare.
var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
/**
 * Reports whether `value` is one of the supported evaluator kind strings.
 * @param {unknown} value - Candidate value read from an eval file.
 * @returns {boolean} True only for strings contained in EVALUATOR_KIND_VALUES.
 */
function isEvaluatorKind(value) {
  if (typeof value !== "string") {
    return false;
  }
  return EVALUATOR_KIND_SET.has(value);
}
57
56
  function getHitCount(result) {
58
57
  return result.hits.length;
@@ -160,7 +159,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
160
159
  if (!Array.isArray(rawTestcases)) {
161
160
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
162
161
  }
163
- const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
162
+ const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
164
163
  const results = [];
165
164
  for (const rawEvalcase of rawTestcases) {
166
165
  if (!isJsonObject(rawEvalcase)) {
@@ -283,7 +282,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
283
282
  const assistantContent = assistantMessages[0]?.content;
284
283
  const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
285
284
  const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
286
- const testCaseGrader = coerceGrader(evalcase.grader) ?? globalGrader;
285
+ const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
286
+ const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
287
287
  const userFilePaths = [];
288
288
  for (const segment of userSegments) {
289
289
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -306,7 +306,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
306
306
  file_paths: allFilePaths,
307
307
  code_snippets: codeSnippets,
308
308
  outcome,
309
- grader: testCaseGrader
309
+ evaluator: testCaseEvaluatorKind,
310
+ evaluators
310
311
  };
311
312
  if (verbose) {
312
313
  console.log(`
@@ -467,14 +468,88 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
467
468
  }
468
469
  return parts.join(" ");
469
470
  }
470
- function coerceGrader(candidate) {
471
/**
 * Reads the evaluator list declared on an eval case and turns it into
 * normalized evaluator configs.
 *
 * The list is taken from `execution.evaluators` when `execution` is an object
 * and that field is non-nullish, otherwise from the case's own `evaluators`.
 * Invalid entries are skipped with a warning rather than failing the case.
 *
 * @param {object} rawEvalCase - Raw eval case object.
 * @param {string[]} searchRoots - Roots handed to resolveFileReference.
 * @param {string} evalId - Case identifier used in warning messages.
 * @returns {Promise<object[]|undefined>} Parsed evaluators, or undefined when
 *   none were declared or none survived validation.
 */
async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
  const execution = rawEvalCase.execution;
  let declared;
  if (isJsonObject(execution)) {
    declared = execution.evaluators ?? rawEvalCase.evaluators;
  } else {
    declared = rawEvalCase.evaluators;
  }
  if (declared === void 0) {
    return void 0;
  }
  if (!Array.isArray(declared)) {
    logWarning(`Skipping evaluators for '${evalId}': expected array`);
    return void 0;
  }
  // Builds the optional detail list attached to "not found" warnings.
  const describeAttempts = (lookup) => {
    if (lookup.attempted.length === 0) {
      return void 0;
    }
    return lookup.attempted.map((attempt) => ` Tried: ${attempt}`);
  };
  const parsed = [];
  for (const entry of declared) {
    if (!isJsonObject(entry)) {
      logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
      continue;
    }
    const name = asString(entry.name);
    const kind = entry.type;
    if (!name || !isEvaluatorKind(kind)) {
      logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
      continue;
    }
    if (kind === "code") {
      const script = asString(entry.script);
      if (!script) {
        logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
        continue;
      }
      const cwd = asString(entry.cwd);
      let resolvedCwd;
      if (cwd) {
        const lookup = await resolveFileReference(cwd, searchRoots);
        if (lookup.resolvedPath) {
          resolvedCwd = path.resolve(lookup.resolvedPath);
        } else {
          // The evaluator is still registered; only the resolved cwd is left unset.
          logWarning(
            `Code evaluator '${name}' in '${evalId}': cwd not found (${lookup.displayPath})`,
            describeAttempts(lookup)
          );
        }
      }
      parsed.push({ name, type: "code", script, cwd, resolvedCwd });
      continue;
    }
    // Remaining kind is "llm_judge": `prompt` may be a file reference or inline text.
    const prompt = asString(entry.prompt);
    let promptPath;
    if (prompt) {
      const lookup = await resolveFileReference(prompt, searchRoots);
      if (lookup.resolvedPath) {
        promptPath = path.resolve(lookup.resolvedPath);
      } else {
        logWarning(
          `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${lookup.displayPath})`,
          describeAttempts(lookup)
        );
      }
    }
    const model = asString(entry.model);
    parsed.push({ name, type: "llm_judge", prompt, promptPath, model });
  }
  return parsed.length > 0 ? parsed : void 0;
}
545
/**
 * Validates an evaluator kind read from an eval file.
 * @param {unknown} candidate - Raw value of the `evaluator` field.
 * @param {string} contextId - Identifier included in the warning text.
 * @returns {string|undefined} The candidate when it is a known kind; otherwise
 *   undefined (a warning is logged for unknown strings only).
 */
function coerceEvaluator(candidate, contextId) {
  const isString = typeof candidate === "string";
  if (isString && isEvaluatorKind(candidate)) {
    return candidate;
  }
  if (isString) {
    logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
  }
  return void 0;
}
480
555
  function logWarning(message, details) {
@@ -670,6 +745,790 @@ var GeminiProvider = class {
670
745
  }
671
746
  };
672
747
 
748
+ // src/evaluation/providers/cli.ts
749
+ import { exec as execWithCallback } from "node:child_process";
750
+ import path2 from "node:path";
751
+ import { promisify } from "node:util";
752
var execAsync = promisify(execWithCallback);
// Upper bound for captured stdout/stderr of a CLI target (10 MiB).
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
/**
 * Runs a shell command and reports its outcome without throwing.
 *
 * @param {string} command - Fully rendered command line.
 * @param {{cwd?: string, env?: object, timeoutMs?: number, signal?: AbortSignal}} options
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number|null,
 *   failed: boolean, timedOut: boolean, signal: string|null}>}
 */
async function defaultCommandRunner(command, options) {
  const execOptions = {
    cwd: options.cwd,
    env: options.env,
    timeout: options.timeoutMs,
    signal: options.signal,
    maxBuffer: DEFAULT_MAX_BUFFER,
    shell: process.platform === "win32" ? "powershell.exe" : void 0
  };
  try {
    const { stdout, stderr } = await execAsync(command, execOptions);
    return {
      stdout,
      stderr,
      exitCode: 0,
      failed: false,
      timedOut: false,
      signal: null
    };
  } catch (error) {
    const execError = error;
    // `killed` is also set when the child is terminated for exceeding maxBuffer
    // or because the caller aborted; neither of those is a timeout.
    const aborted = execError.name === "AbortError" || execError.code === "ABORT_ERR";
    const overflowed = execError.code === "ERR_CHILD_PROCESS_STDIO_MAXBUFFER";
    const timedOut = execError.timedOut === true || execError.killed === true && !aborted && !overflowed;
    return {
      stdout: execError.stdout ?? "",
      stderr: execError.stderr ?? "",
      // Non-numeric codes (e.g. "ENOENT", "ABORT_ERR") are not exit statuses.
      exitCode: typeof execError.code === "number" ? execError.code : null,
      failed: true,
      timedOut,
      signal: execError.signal ?? null
    };
  }
}
785
/**
 * Provider that answers a request by rendering a command template and running
 * it through an injectable command runner.
 */
var CliProvider = class {
  id;
  kind = "cli";
  targetName;
  supportsBatch = false;
  config;
  runCommand;
  healthcheckPromise;
  /**
   * @param {string} targetName - Target name; also used to build `id`.
   * @param {object} config - Resolved CLI target configuration.
   * @param {Function} [runner] - Command runner; defaults to defaultCommandRunner.
   */
  constructor(targetName, config, runner = defaultCommandRunner) {
    this.targetName = targetName;
    this.id = `cli:${targetName}`;
    this.config = config;
    this.runCommand = runner;
  }
  /**
   * Renders the command for `request`, runs it and returns its stdout.
   * @param {object} request - Provider request (prompt, inputFiles, signal, ...).
   * @returns {Promise<{text: string, raw: object}>}
   * @throws {Error} When aborted, timed out, or the command fails.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("CLI provider request was aborted before execution");
    }
    await this.ensureHealthy(request.signal);
    const templateValues = buildTemplateValues(request, this.config);
    const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
    const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
    const result = await this.runCommand(renderedCommand, {
      cwd: this.config.cwd,
      env,
      timeoutMs: this.config.timeoutMs,
      signal: request.signal
    });
    if (result.failed || (result.exitCode ?? 0) !== 0) {
      // Abort takes precedence: a killed child may otherwise look like a timeout.
      if (request.signal?.aborted) {
        throw new Error("CLI provider request was aborted");
      }
      if (result.timedOut) {
        throw new Error(
          `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
        );
      }
      const codeText = result.exitCode !== null ? result.exitCode : "unknown";
      const detail = result.stderr.trim() || result.stdout.trim();
      const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
      throw new Error(message);
    }
    return {
      text: result.stdout,
      raw: {
        command: renderedCommand,
        stderr: result.stderr,
        exitCode: result.exitCode ?? 0,
        cwd: this.config.cwd
      }
    };
  }
  /**
   * Runs the configured healthcheck once and shares its outcome with every
   * caller. A failure caused by the caller's abort signal is not cached, so a
   * cancelled first request cannot permanently poison the provider.
   * @param {AbortSignal} [signal]
   */
  async ensureHealthy(signal) {
    if (!this.config.healthcheck) {
      return;
    }
    if (!this.healthcheckPromise) {
      const pending = this.runHealthcheck(this.config.healthcheck, signal).catch((error) => {
        if (signal?.aborted && this.healthcheckPromise === pending) {
          this.healthcheckPromise = void 0;
        }
        throw error;
      });
      this.healthcheckPromise = pending;
    }
    return this.healthcheckPromise;
  }
  /**
   * Executes an HTTP or command healthcheck.
   * @param {object} healthcheck - Healthcheck config (`type: "http"` with `url`,
   *   otherwise a `commandTemplate`).
   * @param {AbortSignal} [signal]
   * @throws {Error} When the healthcheck fails.
   */
  async runHealthcheck(healthcheck, signal) {
    if (!healthcheck) {
      return;
    }
    const timeoutMs = healthcheck.timeoutMs ?? this.config.timeoutMs;
    if (healthcheck.type === "http") {
      const controller = new AbortController();
      const timer = timeoutMs ? setTimeout(() => controller.abort(), timeoutMs) : void 0;
      const onAbort = () => controller.abort();
      if (signal?.aborted) {
        // An already-aborted signal never fires "abort" again.
        controller.abort();
      } else {
        signal?.addEventListener("abort", onAbort, { once: true });
      }
      try {
        const response = await fetch(healthcheck.url, { method: "GET", signal: controller.signal });
        if (!response.ok) {
          throw new Error(`HTTP ${response.status} ${response.statusText}`);
        }
      } catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        throw new Error(`CLI healthcheck failed for '${this.targetName}': ${reason}`);
      } finally {
        if (timer !== void 0) {
          clearTimeout(timer);
        }
        signal?.removeEventListener("abort", onAbort);
      }
      return;
    }
    // Command healthcheck: render the template with empty request values.
    const renderedCommand = renderTemplate(
      healthcheck.commandTemplate,
      buildTemplateValues(
        {
          prompt: "",
          guidelines: "",
          inputFiles: [],
          evalCaseId: "",
          attempt: 0
        },
        this.config
      )
    );
    const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
    const result = await this.runCommand(renderedCommand, {
      cwd: healthcheck.cwd ?? this.config.cwd,
      env,
      timeoutMs,
      signal
    });
    if (result.failed || (result.exitCode ?? 0) !== 0) {
      const codeText = result.exitCode !== null ? result.exitCode : "unknown";
      const detail = result.stderr.trim() || result.stdout.trim();
      const message = detail ? `${detail} (exit code ${codeText})` : `CLI healthcheck command exited with code ${codeText}`;
      throw new Error(`CLI healthcheck failed for '${this.targetName}': ${message}`);
    }
  }
};
898
/**
 * Produces the placeholder values substituted into a CLI command template.
 * All values are shell-escaped.
 * @param {object} request - Provider request.
 * @param {object} config - CLI target config (`filesFormat` is read).
 * @returns {{PROMPT: string, GUIDELINES: string, EVAL_ID: string, ATTEMPT: string, FILES: string}}
 */
function buildTemplateValues(request, config) {
  const files = normalizeInputFiles(request.inputFiles);
  const quoteOrEmpty = (text) => shellEscape(text ?? "");
  const values = {};
  values.PROMPT = quoteOrEmpty(request.prompt);
  values.GUIDELINES = quoteOrEmpty(request.guidelines);
  values.EVAL_ID = quoteOrEmpty(request.evalCaseId);
  values.ATTEMPT = shellEscape(String(request.attempt ?? 0));
  values.FILES = formatFileList(files, config.filesFormat);
  return values;
}
908
/**
 * Resolves input file paths to absolute form and drops duplicates, keeping
 * first-seen order.
 * @param {string[]|undefined} inputFiles
 * @returns {string[]|undefined} Undefined when no files were supplied.
 */
function normalizeInputFiles(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return void 0;
  }
  // A Set preserves insertion order, so the first occurrence of each path wins.
  const seen = /* @__PURE__ */ new Set();
  for (const candidate of inputFiles) {
    seen.add(path2.resolve(candidate));
  }
  return [...seen];
}
921
/**
 * Renders the `{FILES}` placeholder: one formatted, shell-escaped entry per
 * file, joined by single spaces.
 *
 * @param {string[]|undefined} files - Absolute file paths.
 * @param {string} [template] - Per-file format supporting `{path}` and
 *   `{basename}`; defaults to "{path}".
 * @returns {string} Empty string when there are no files.
 */
function formatFileList(files, template) {
  if (!files || files.length === 0) {
    return "";
  }
  const formatter = template ?? "{path}";
  return files.map((filePath) => {
    const escapedPath = shellEscape(filePath);
    const escapedName = shellEscape(path2.basename(filePath));
    // Single pass with a replacer function: a string replacement would expand
    // `$&`, `$$`, etc. found in file names, and a second pass could rewrite
    // placeholder-looking text inside an already substituted path.
    return formatter.replace(
      /\{(path|basename)\}/g,
      (_match, token) => token === "path" ? escapedPath : escapedName
    );
  }).join(" ");
}
932
/**
 * Substitutes `{NAME}` placeholders (uppercase letters and underscores) with
 * entries from `values`. Placeholders without a defined value stay as written.
 * @param {string} template
 * @param {Record<string, string>} values
 * @returns {string}
 */
function renderTemplate(template, values) {
  const substitute = (placeholder, key) => {
    const value = values[key];
    if (value === void 0) {
      return placeholder;
    }
    return value;
  };
  return template.replace(/\{([A-Z_]+)\}/g, substitute);
}
938
/**
 * Quotes `value` so the shell used by the default command runner treats it as
 * one literal argument.
 *
 * On Windows the default runner uses PowerShell, so the value is emitted as a
 * single-quoted PowerShell literal. Double quotes with `\"` are not valid
 * PowerShell escaping and would still allow `$variable` / `$(...)` expansion.
 * Elsewhere POSIX single quoting is used.
 *
 * @param {string} value
 * @returns {string} Quoted argument; `''` for the empty string.
 */
function shellEscape(value) {
  if (value.length === 0) {
    return "''";
  }
  if (process.platform === "win32") {
    // PowerShell also treats the typographic quotes U+2018..U+201B as
    // single-quote delimiters; each one is escaped by doubling it.
    const escaped = value.replace(/['\u2018\u2019\u201A\u201B]/g, "$&$&");
    return `'${escaped}'`;
  }
  // POSIX: close the quote, emit a double-quoted apostrophe, reopen the quote.
  return `'${value.replace(/'/g, `'"'"'`)}'`;
}
948
/**
 * Builds the " after Ns" suffix used in timeout error messages.
 * @param {number|undefined} timeoutMs
 * @returns {string} Empty when the timeout is unset or not positive; otherwise
 *   the timeout rounded up to whole seconds.
 */
function formatTimeoutSuffix(timeoutMs) {
  const hasTimeout = Boolean(timeoutMs) && timeoutMs > 0;
  return hasTimeout ? ` after ${Math.ceil(timeoutMs / 1e3)}s` : "";
}
955
+
956
+ // src/evaluation/providers/codex.ts
957
+ import { exec as execCallback, spawn } from "node:child_process";
958
+ import { constants as constants2 } from "node:fs";
959
+ import { access as access2, copyFile, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
960
+ import { tmpdir } from "node:os";
961
+ import path4 from "node:path";
962
+ import { promisify as promisify2 } from "node:util";
963
+
964
+ // src/evaluation/providers/preread.ts
965
+ import path3 from "node:path";
966
/**
 * Assembles the prompt document: an optional mandatory pre-read block listing
 * guideline and input files, followed by the user query section.
 * @param {object} request - Provider request (`prompt`, `guideline_patterns`).
 * @param {string[]|undefined} inputFiles - Files to reference in the pre-read block.
 * @param {{guidelinePatterns?: string[], guidelineOverrides?: Set<string>}} [options]
 * @returns {string} The trimmed prompt document.
 */
function buildPromptDocument(request, inputFiles, options) {
  const patterns = options?.guidelinePatterns ?? request.guideline_patterns;
  const guidelineFiles = collectGuidelineFiles(inputFiles, patterns, options?.guidelineOverrides);
  // Guideline files are listed separately, so they are removed from the input list.
  const otherFiles = collectInputFiles(inputFiles).filter((file) => !guidelineFiles.includes(file));
  const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, otherFiles);
  const sections = prereadBlock.length > 0 ? ["\n", prereadBlock] : [];
  sections.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
  return sections.join("\n").trim();
}
984
/**
 * Resolves input file paths to absolute form and removes duplicates while
 * keeping first-seen order.
 * @param {string[]|undefined} inputFiles
 * @returns {string[]|undefined} Undefined when no files were supplied.
 */
function normalizeInputFiles2(inputFiles) {
  const hasFiles = Boolean(inputFiles) && inputFiles.length > 0;
  if (!hasFiles) {
    return void 0;
  }
  const resolved = inputFiles.map((inputFile) => path3.resolve(inputFile));
  // Set iteration follows insertion order, so the first occurrence is kept.
  return Array.from(new Set(resolved));
}
997
/**
 * Selects the input files that count as guideline files.
 * A file qualifies when its absolute path is in `overrides`, or when
 * isGuidelineFile accepts its forward-slash form for the given patterns.
 * @param {string[]|undefined} inputFiles
 * @param {string[]|undefined} guidelinePatterns
 * @param {Set<string>} [overrides] - Absolute paths forced to be guidelines.
 * @returns {string[]} Unique absolute paths in first-seen order.
 */
function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  const selected = /* @__PURE__ */ new Set();
  for (const inputFile of inputFiles) {
    const absolutePath = path3.resolve(inputFile);
    // Overrides short-circuit the pattern check.
    const isGuideline = overrides?.has(absolutePath) || isGuidelineFile(absolutePath.split(path3.sep).join("/"), guidelinePatterns);
    if (isGuideline) {
      selected.add(absolutePath);
    }
  }
  return Array.from(selected);
}
1019
/**
 * Returns the unique absolute paths of `inputFiles` in first-seen order.
 * @param {string[]|undefined} inputFiles
 * @returns {string[]} Empty array when no files were supplied.
 */
function collectInputFiles(inputFiles) {
  const absolutePaths = (inputFiles ?? []).map((inputFile) => path3.resolve(inputFile));
  // Keep only the first occurrence of each path.
  return absolutePaths.filter((absolutePath, position) => absolutePaths.indexOf(absolutePath) === position);
}
1032
/**
 * Builds the instruction block telling the agent which files to read first.
 * @param {string[]} guidelineFiles - Absolute paths of guideline files.
 * @param {string[]} inputFiles - Absolute paths of the remaining input files.
 * @returns {string} Empty string when both lists are empty.
 */
function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
  if (guidelineFiles.length === 0 && inputFiles.length === 0) {
    return "";
  }
  // One markdown bullet per file, linking the base name to its file URI.
  const toBullets = (files) => {
    const bullets = [];
    for (const absolutePath of files) {
      bullets.push(`* [${path3.basename(absolutePath)}](${pathToFileUri(absolutePath)})`);
    }
    return bullets.join("\n");
  };
  const lines = [];
  if (guidelineFiles.length > 0) {
    lines.push(`Read all guideline files:\n${toBullets(guidelineFiles)}.`);
  }
  if (inputFiles.length > 0) {
    lines.push(`Read all input files:\n${toBullets(inputFiles)}.`);
  }
  lines.push("If any file is missing, fail with ERROR: missing-file <filename> and stop.");
  lines.push("Then apply system_instructions on the user query below.");
  return lines.join("\n");
}
1056
/**
 * Converts a file path to a `file://` URI string.
 * Backslashes become forward slashes; paths starting with a drive letter get
 * the extra leading slash (`file:///C:/...`). No percent-encoding is applied.
 * @param {string} filePath - Absolute or relative path.
 * @returns {string}
 */
function pathToFileUri(filePath) {
  let absolutePath = filePath;
  if (!path3.isAbsolute(filePath)) {
    absolutePath = path3.resolve(filePath);
  }
  const normalizedPath = absolutePath.replace(/\\/g, "/");
  const startsWithDrive = /^[a-zA-Z]:\//.test(normalizedPath);
  return startsWithDrive ? `file:///${normalizedPath}` : `file://${normalizedPath}`;
}
1064
+
1065
+ // src/evaluation/providers/codex.ts
1066
var execAsync2 = promisify2(execCallback);
// Temp-workspace layout used for each Codex invocation.
var WORKSPACE_PREFIX = "agentv-codex-";
var PROMPT_FILENAME = "prompt.md";
var FILES_DIR = "files";
// Event type checked when reading Codex `--json` output.
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
/**
 * Provider that runs the Codex CLI in a throwaway workspace, feeding the
 * prompt on stdin and reading the assistant reply from its JSON output.
 */
var CodexProvider = class {
  id;
  kind = "codex";
  targetName;
  supportsBatch = false;
  config;
  runCodex;
  environmentCheck;
  resolvedExecutable;
  /**
   * @param {string} targetName - Target name; also used to build `id`.
   * @param {object} config - Resolved Codex target configuration.
   * @param {Function} [runner] - Process runner; defaults to defaultCodexRunner.
   */
  constructor(targetName, config, runner = defaultCodexRunner) {
    this.id = `codex:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runCodex = runner;
  }
  /**
   * Runs one request: mirrors input files into a temp workspace, writes the
   * prompt, executes Codex and extracts the assistant text. The workspace is
   * removed afterwards regardless of outcome.
   * @param {object} request - Provider request (prompt, inputFiles, signal, ...).
   * @returns {Promise<{text: string, raw: object}>}
   * @throws {Error} When aborted, timed out, exiting non-zero, or on unparseable output.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Codex provider request was aborted before execution");
    }
    await this.ensureEnvironmentReady();
    const inputFiles = normalizeInputFiles2(request.inputFiles);
    const originalGuidelines = new Set(
      collectGuidelineFiles(inputFiles, request.guideline_patterns).map((file) => path4.resolve(file))
    );
    const workspaceRoot = await this.createWorkspace();
    try {
      const { mirroredInputFiles, guidelineMirrors } = await this.mirrorInputFiles(
        inputFiles,
        workspaceRoot,
        originalGuidelines
      );
      // Mirrored copies live under new paths, so guideline status is carried
      // over explicitly instead of being re-derived from patterns alone.
      const promptContent = buildPromptDocument(request, mirroredInputFiles, {
        guidelinePatterns: request.guideline_patterns,
        guidelineOverrides: guidelineMirrors
      });
      const promptFile = path4.join(workspaceRoot, PROMPT_FILENAME);
      await writeFile(promptFile, promptContent, "utf8");
      const args = this.buildCodexArgs();
      const cwd = this.resolveCwd(workspaceRoot);
      const result = await this.executeCodex(args, cwd, promptContent, request.signal);
      // A run killed by the caller's abort would otherwise be reported as
      // "exited with code -1"; report it as an abort, as CliProvider does.
      if (request.signal?.aborted) {
        throw new Error("Codex provider request was aborted");
      }
      if (result.timedOut) {
        throw new Error(
          `Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
        );
      }
      if (result.exitCode !== 0) {
        const detail = pickDetail(result.stderr, result.stdout);
        const prefix = `Codex CLI exited with code ${result.exitCode}`;
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parseCodexJson(result.stdout);
      const assistantText = extractAssistantText(parsed);
      return {
        text: assistantText,
        raw: {
          response: parsed,
          stdout: result.stdout,
          stderr: result.stderr,
          exitCode: result.exitCode,
          args,
          executable: this.resolvedExecutable ?? this.config.executable,
          promptFile,
          workspace: workspaceRoot,
          inputFiles: mirroredInputFiles
        }
      };
    } finally {
      await this.cleanupWorkspace(workspaceRoot);
    }
  }
  /** Locates the executable once; later calls share the same result. */
  async ensureEnvironmentReady() {
    if (!this.environmentCheck) {
      this.environmentCheck = this.validateEnvironment();
    }
    await this.environmentCheck;
  }
  /** Resolves and stores the path of the configured executable. */
  async validateEnvironment() {
    this.resolvedExecutable = await locateExecutable(this.config.executable);
  }
  /**
   * @param {string} workspaceRoot
   * @returns {string} The configured cwd (made absolute) or the workspace root.
   */
  resolveCwd(workspaceRoot) {
    if (!this.config.cwd) {
      return workspaceRoot;
    }
    return path4.resolve(this.config.cwd);
  }
  /**
   * @returns {string[]} Fixed Codex flags, then configured extra args, then
   *   the trailing "-" argument (the prompt itself is supplied on stdin).
   */
  buildCodexArgs() {
    const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
    if (this.config.args && this.config.args.length > 0) {
      args.push(...this.config.args);
    }
    args.push("-");
    return args;
  }
  /**
   * Invokes the runner, translating a missing executable into an actionable error.
   * @throws {Error} Descriptive error on ENOENT (original kept as `cause`);
   *   any other runner error is rethrown unchanged.
   */
  async executeCodex(args, cwd, promptContent, signal) {
    try {
      return await this.runCodex({
        executable: this.resolvedExecutable ?? this.config.executable,
        args,
        cwd,
        prompt: promptContent,
        timeoutMs: this.config.timeoutMs,
        env: process.env,
        signal
      });
    } catch (error) {
      const err = error;
      if (err.code === "ENOENT") {
        throw new Error(
          `Codex executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`,
          { cause: error }
        );
      }
      throw error;
    }
  }
  /**
   * Copies input files into `<workspace>/files`, de-duplicating base names by
   * appending `.<n>` to repeats.
   * @param {string[]|undefined} inputFiles
   * @param {string} workspaceRoot
   * @param {Set<string>} guidelineOriginals - Absolute source paths that are guidelines.
   * @returns {Promise<{mirroredInputFiles: string[]|undefined, guidelineMirrors: Set<string>}>}
   */
  async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
    if (!inputFiles || inputFiles.length === 0) {
      return {
        mirroredInputFiles: void 0,
        guidelineMirrors: /* @__PURE__ */ new Set()
      };
    }
    const filesRoot = path4.join(workspaceRoot, FILES_DIR);
    await mkdir(filesRoot, { recursive: true });
    const mirrored = [];
    const guidelineMirrors = /* @__PURE__ */ new Set();
    const nameCounts = /* @__PURE__ */ new Map();
    for (const inputFile of inputFiles) {
      const absoluteSource = path4.resolve(inputFile);
      const baseName = path4.basename(absoluteSource);
      const count = nameCounts.get(baseName) ?? 0;
      nameCounts.set(baseName, count + 1);
      const finalName = count === 0 ? baseName : `${baseName}.${count}`;
      const destination = path4.join(filesRoot, finalName);
      await copyFile(absoluteSource, destination);
      const resolvedDestination = path4.resolve(destination);
      mirrored.push(resolvedDestination);
      if (guidelineOriginals.has(absoluteSource)) {
        guidelineMirrors.add(resolvedDestination);
      }
    }
    return {
      mirroredInputFiles: mirrored,
      guidelineMirrors
    };
  }
  /** Creates a fresh temp directory for this invocation. */
  async createWorkspace() {
    return await mkdtemp(path4.join(tmpdir(), WORKSPACE_PREFIX));
  }
  /** Best-effort removal of the workspace; failures are deliberately ignored. */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await rm(workspaceRoot, { recursive: true, force: true });
    } catch {
      // Cleanup must never mask the result or error of the invocation itself.
    }
  }
};
1226
/**
 * Resolves the Codex executable to a concrete, existing path.
 *
 * Candidates containing a path separator are treated as paths; bare names are
 * looked up with `where` (Windows) or `which` (elsewhere).
 *
 * @param {string} candidate - Executable name or path from the target config.
 * @returns {Promise<string>} Path of the executable.
 * @throws {Error} When a bare name cannot be found on PATH, or (for path
 *   candidates) when the file does not exist.
 */
async function locateExecutable(candidate) {
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
  if (includesPathSeparator) {
    const resolved = path4.isAbsolute(candidate) ? candidate : path4.resolve(candidate);
    const executablePath = await ensureWindowsExecutableVariant(resolved);
    await access2(executablePath, constants2.F_OK);
    return executablePath;
  }
  const onWindows = process.platform === "win32";
  const locator = onWindows ? "where" : "which";
  // The lookup goes through a shell, so the configured name is quoted to keep
  // spaces and shell metacharacters from being interpreted as extra syntax.
  const quotedCandidate = onWindows ? `"${candidate.replace(/"/g, "")}"` : `'${candidate.replace(/'/g, `'"'"'`)}'`;
  try {
    const { stdout } = await execAsync2(`${locator} ${quotedCandidate}`);
    const lines = stdout.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
    const preferred = selectExecutableCandidate(lines);
    if (preferred) {
      const executablePath = await ensureWindowsExecutableVariant(preferred);
      await access2(executablePath, constants2.F_OK);
      return executablePath;
    }
  } catch {
    // Any lookup failure is reported uniformly by the error below.
  }
  throw new Error(`Codex executable '${candidate}' was not found on PATH`);
}
1248
/**
 * Chooses one path from the lines printed by `which` / `where`.
 * On Windows the first candidate matching an executable extension wins, with
 * extensions tried in the order getWindowsExecutableExtensions returns them;
 * otherwise the first candidate is used.
 * @param {string[]} candidates
 * @returns {string|undefined} Undefined when the list is empty.
 */
function selectExecutableCandidate(candidates) {
  const [first] = candidates;
  if (candidates.length === 0) {
    return void 0;
  }
  if (process.platform === "win32") {
    for (const ext of getWindowsExecutableExtensions()) {
      const hit = candidates.find((candidate) => candidate.toLowerCase().endsWith(ext));
      if (hit) {
        return hit;
      }
    }
  }
  return first;
}
1264
/**
 * On Windows, appends the first executable extension for which a file exists
 * when `candidate` has none. Elsewhere the candidate is returned untouched.
 * @param {string} candidate - Executable path.
 * @returns {Promise<string>} The existing variant, or `candidate` as a fallback.
 */
async function ensureWindowsExecutableVariant(candidate) {
  const needsProbe = process.platform === "win32" && !hasExecutableExtension(candidate);
  if (!needsProbe) {
    return candidate;
  }
  for (const ext of getWindowsExecutableExtensions()) {
    const variant = `${candidate}${ext}`;
    const exists = await access2(variant, constants2.F_OK).then(
      () => true,
      () => false
    );
    if (exists) {
      return variant;
    }
  }
  return candidate;
}
1282
/**
 * Reports whether `candidate` ends with one of the extensions returned by
 * getWindowsExecutableExtensions (compared in lower case).
 * @param {string} candidate
 * @returns {boolean}
 */
function hasExecutableExtension(candidate) {
  const lowered = candidate.toLowerCase();
  for (const ext of getWindowsExecutableExtensions()) {
    if (lowered.endsWith(ext)) {
      return true;
    }
  }
  return false;
}
1286
// Fallback extension list used when PATHEXT is unset or empty.
var DEFAULT_WINDOWS_EXTENSIONS = [".com", ".exe", ".bat", ".cmd", ".ps1"];
/**
 * Lists the executable file extensions to consider, in lower case.
 * @returns {string[]} Empty off Windows; on Windows the entries of PATHEXT,
 *   or DEFAULT_WINDOWS_EXTENSIONS when PATHEXT yields nothing.
 */
function getWindowsExecutableExtensions() {
  if (process.platform !== "win32") {
    return [];
  }
  const configured = [];
  for (const rawExt of process.env.PATHEXT?.split(";") ?? []) {
    const ext = rawExt.trim().toLowerCase();
    if (ext.length > 0) {
      configured.push(ext);
    }
  }
  return configured.length > 0 ? configured : DEFAULT_WINDOWS_EXTENSIONS;
}
1294
/**
 * Parses Codex `--json` stdout.
 * Attempts, in order: the whole output as one JSON document, the output as
 * JSON Lines, and finally the text from the last `{` onwards.
 * @param {string} output - Raw stdout.
 * @returns {unknown} Parsed value (an array when parsed as JSON Lines).
 * @throws {Error} When the output is empty or no strategy yields valid JSON.
 */
function parseCodexJson(output) {
  const text = output.trim();
  if (text.length === 0) {
    throw new Error("Codex CLI produced no output in --json mode");
  }
  try {
    return JSON.parse(text);
  } catch {
    // Not a single document; fall through to the alternative strategies.
  }
  const events = parseJsonLines(text);
  if (events) {
    return events;
  }
  const tailStart = text.lastIndexOf("{");
  if (tailStart >= 0) {
    try {
      return JSON.parse(text.slice(tailStart));
    } catch {
      // The trailing fragment is not valid JSON either.
    }
  }
  const preview = text.slice(0, 200);
  throw new Error(`Codex CLI emitted invalid JSON: ${preview}${text.length > 200 ? "\u2026" : ""}`);
}
1318
/**
 * Extracts the assistant's reply from parsed Codex output.
 * Sources are tried in order: an event stream (when `parsed` is an array), the
 * value treated as a single event, the last assistant entry in `messages`,
 * `response.content`, then `output`.
 * @param {unknown} parsed - Result of parseCodexJson.
 * @returns {string} The assistant text.
 * @throws {Error} When no source yields text.
 */
function extractAssistantText(parsed) {
  const missing = () => new Error("Codex CLI JSON response did not include an assistant message");
  if (Array.isArray(parsed)) {
    const streamed = extractFromEventStream(parsed);
    if (streamed) {
      return streamed;
    }
  }
  if (!parsed || typeof parsed !== "object") {
    throw missing();
  }
  const fromEvent = extractFromEvent(parsed);
  if (fromEvent) {
    return fromEvent;
  }
  if (Array.isArray(parsed.messages)) {
    // Walk backwards so the most recent assistant message wins.
    for (let position = parsed.messages.length - 1; position >= 0; position -= 1) {
      const entry = parsed.messages[position];
      const isAssistant = entry && typeof entry === "object" && entry.role === "assistant";
      if (!isAssistant) {
        continue;
      }
      const messageText = flattenContent(entry.content);
      if (messageText) {
        return messageText;
      }
    }
  }
  const response = parsed.response;
  if (response && typeof response === "object") {
    const responseText = flattenContent(response.content);
    if (responseText) {
      return responseText;
    }
  }
  const outputText = flattenContent(parsed.output);
  if (outputText) {
    return outputText;
  }
  throw missing();
}
1366
/**
 * Scans events from last to first and returns the first text found.
 * @param {unknown[]} events
 * @returns {string|undefined}
 */
function extractFromEventStream(events) {
  let cursor = events.length;
  while (cursor > 0) {
    cursor -= 1;
    const found = extractFromEvent(events[cursor]);
    if (found) {
      return found;
    }
  }
  return void 0;
}
1376
/**
 * Pulls text out of one Codex event.
 * For "item.completed" events the nested item is inspected first; otherwise,
 * or when that yields nothing, the event's `output` (falling back to
 * `content`) is flattened.
 * @param {unknown} event
 * @returns {string|undefined}
 */
function extractFromEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const isCompletedItem = typeof event.type === "string" && event.type === JSONL_TYPE_ITEM_COMPLETED;
  if (isCompletedItem) {
    const itemText = extractFromItem(event.item);
    if (itemText) {
      return itemText;
    }
  }
  const flattened = flattenContent(event.output ?? event.content);
  return flattened ? flattened : void 0;
}
1396
/**
 * Reads text from a completed item when its type is one of the
 * message-bearing kinds ("agent_message", "response", "output").
 * @param {unknown} item
 * @returns {string|undefined}
 */
function extractFromItem(item) {
  if (!item || typeof item !== "object") {
    return void 0;
  }
  const kind = typeof item.type === "string" ? item.type : void 0;
  switch (kind) {
    case "agent_message":
    case "response":
    case "output": {
      const text = flattenContent(item.text ?? item.content ?? item.output);
      return text ? text : void 0;
    }
    default:
      return void 0;
  }
}
1410
/**
 * Reduces a content value to plain text.
 * - string: returned as is;
 * - array: non-empty string segments (plain strings or objects with a string
 *   `text`) joined with " \n";
 * - object with a `text` property: that text when it is a string.
 * @param {unknown} value
 * @returns {string|undefined} Undefined when no text can be derived.
 */
function flattenContent(value) {
  // Text carried by one segment, or undefined when it has none.
  const textOf = (segment) => {
    if (typeof segment === "string") {
      return segment;
    }
    if (segment && typeof segment === "object" && "text" in segment) {
      return typeof segment.text === "string" ? segment.text : void 0;
    }
    return void 0;
  };
  if (typeof value === "string") {
    return value;
  }
  if (!Array.isArray(value)) {
    return value && typeof value === "object" && "text" in value ? textOf(value) : void 0;
  }
  const pieces = [];
  for (const segment of value) {
    const piece = textOf(segment);
    if (typeof piece === "string" && piece.length > 0) {
      pieces.push(piece);
    }
  }
  return pieces.length > 0 ? pieces.join(" \n") : void 0;
}
1433
/**
 * Parses output as JSON Lines.
 * @param {string} output
 * @returns {unknown[]|undefined} One parsed value per non-blank line, or
 *   undefined when there are fewer than two such lines or any line is invalid.
 */
function parseJsonLines(output) {
  const rows = [];
  for (const rawLine of output.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line.length > 0) {
      rows.push(line);
    }
  }
  if (rows.length <= 1) {
    return void 0;
  }
  try {
    return rows.map((row) => JSON.parse(row));
  } catch {
    // A single malformed line disqualifies the whole output.
    return void 0;
  }
}
1448
/**
 * Picks the text used to explain a failed run: trimmed stderr when it is not
 * empty, otherwise trimmed stdout.
 * @param {string} stderr
 * @param {string} stdout
 * @returns {string|undefined} Undefined when both are blank.
 */
function pickDetail(stderr, stdout) {
  for (const stream of [stderr, stdout]) {
    const text = stream.trim();
    if (text.length > 0) {
      return text;
    }
  }
  return void 0;
}
1456
/**
 * Builds the " after Ns" suffix for Codex timeout errors.
 * @param {number|undefined} timeoutMs
 * @returns {string} Empty when the timeout is unset or not positive; otherwise
 *   the timeout rounded up to whole seconds.
 */
function formatTimeoutSuffix2(timeoutMs) {
  if (timeoutMs && timeoutMs > 0) {
    const wholeSeconds = Math.ceil(timeoutMs / 1e3);
    return ` after ${wholeSeconds}s`;
  }
  return "";
}
1463
/**
 * Spawns the Codex executable, writes the prompt to its stdin and collects
 * its output.
 *
 * @param {{executable: string, args: string[], cwd?: string, env?: object,
 *   prompt: string, timeoutMs?: number, signal?: AbortSignal}} options
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number, timedOut: boolean}>}
 *   `exitCode` is -1 when the process ended without a numeric code (e.g. it
 *   was killed by a signal).
 * @throws Rejects with the spawn error (e.g. ENOENT) when the process cannot start.
 */
async function defaultCodexRunner(options) {
  return await new Promise((resolve, reject) => {
    const child = spawn(options.executable, options.args, {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: shouldShellExecute(options.executable)
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    const onAbort = () => {
      child.kill("SIGTERM");
    };
    if (options.signal) {
      if (options.signal.aborted) {
        onAbort();
      } else {
        options.signal.addEventListener("abort", onAbort, { once: true });
      }
    }
    let timeoutHandle;
    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill("SIGTERM");
      }, options.timeoutMs);
      // Do not keep the event loop alive just for the timeout timer.
      timeoutHandle.unref?.();
    }
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    // Writing the prompt can fail (e.g. EPIPE) when the child exits early or
    // never starts. Without a listener that stream error would be unhandled;
    // the actual outcome is still reported through "error"/"close" below.
    child.stdin.on("error", () => {
    });
    child.stdin.end(options.prompt);
    const cleanup = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", onAbort);
      }
    };
    child.on("error", (error) => {
      cleanup();
      reject(error);
    });
    child.on("close", (code) => {
      cleanup();
      resolve({
        stdout,
        stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
1524
/**
 * Decides whether the executable must be launched through a shell: only on
 * Windows, and only for `.cmd`, `.bat` or `.ps1` files.
 * @param {string} executable
 * @returns {boolean}
 */
function shouldShellExecute(executable) {
  if (process.platform !== "win32") {
    return false;
  }
  const lowered = executable.toLowerCase();
  return [".cmd", ".bat", ".ps1"].some((ext) => lowered.endsWith(ext));
}
1531
+
673
1532
  // src/evaluation/providers/mock.ts
674
1533
  var DEFAULT_MOCK_RESPONSE = '{"answer":"Mock provider response. Configure targets.yaml to supply a custom value."}';
675
1534
  var MockProvider = class {
@@ -713,6 +1572,7 @@ var MockProvider = class {
713
1572
 
714
1573
  // src/evaluation/providers/targets.ts
715
1574
  import { z } from "zod";
1575
+ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES"]);
716
1576
  var BASE_TARGET_SCHEMA = z.object({
717
1577
  name: z.string().min(1, "target name is required"),
718
1578
  provider: z.string().min(1, "provider is required"),
@@ -769,6 +1629,16 @@ function resolveTargetDefinition(definition, env = process.env) {
769
1629
  providerBatching,
770
1630
  config: resolveGeminiConfig(parsed, env)
771
1631
  };
1632
+ case "codex":
1633
+ case "codex-cli":
1634
+ return {
1635
+ kind: "codex",
1636
+ name: parsed.name,
1637
+ judgeTarget: parsed.judge_target,
1638
+ workers: parsed.workers,
1639
+ providerBatching,
1640
+ config: resolveCodexConfig(parsed, env)
1641
+ };
772
1642
  case "mock":
773
1643
  return {
774
1644
  kind: "mock",
@@ -788,6 +1658,15 @@ function resolveTargetDefinition(definition, env = process.env) {
788
1658
  providerBatching,
789
1659
  config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
790
1660
  };
1661
+ case "cli":
1662
+ return {
1663
+ kind: "cli",
1664
+ name: parsed.name,
1665
+ judgeTarget: parsed.judge_target,
1666
+ workers: parsed.workers,
1667
+ providerBatching,
1668
+ config: resolveCliConfig(parsed, env)
1669
+ };
791
1670
  default:
792
1671
  throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
793
1672
  }
@@ -855,6 +1734,29 @@ function resolveGeminiConfig(target, env) {
855
1734
  maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`)
856
1735
  };
857
1736
  }
1737
/**
 * Build the runtime configuration for a `codex` target from its `settings`.
 *
 * Accepted setting aliases: `executable` / `command` / `binary`,
 * `args` / `arguments`, `cwd`, and `timeout_seconds` / `timeoutSeconds`.
 * The executable falls back to `"codex"` when nothing resolves.
 *
 * @param {object} target - Parsed target definition (`name`, optional `settings`).
 * @param {object} env - Environment map handed to the resolve helpers.
 * @returns {{executable: string, args: (string[]|undefined), cwd: (string|undefined), timeoutMs: (number|undefined)}}
 */
function resolveCodexConfig(target, env) {
  const {
    executable: executableSetting,
    command: commandSetting,
    binary: binarySetting,
    args: argsSetting,
    arguments: argumentsSetting,
    cwd: cwdSetting,
    timeout_seconds: timeoutSnake,
    timeoutSeconds: timeoutCamel
  } = target.settings ?? {};
  const resolvedExecutable = resolveOptionalString(
    executableSetting ?? commandSetting ?? binarySetting,
    env,
    `${target.name} codex executable`,
    { allowLiteral: true, optionalEnv: true }
  );
  // Property initialisers run top to bottom, so the helpers are still invoked in
  // the order executable -> args -> cwd -> timeout.
  return {
    executable: resolvedExecutable ?? "codex",
    args: resolveOptionalStringArray(argsSetting ?? argumentsSetting, env, `${target.name} codex args`),
    cwd: resolveOptionalString(cwdSetting, env, `${target.name} codex cwd`, {
      allowLiteral: true,
      optionalEnv: true
    }),
    timeoutMs: resolveTimeoutMs(timeoutSnake ?? timeoutCamel, `${target.name} codex timeout`)
  };
}
858
1760
  function resolveMockConfig(target) {
859
1761
  const settings = target.settings ?? {};
860
1762
  const response = typeof settings.response === "string" ? settings.response : void 0;
@@ -884,6 +1786,125 @@ function resolveVSCodeConfig(target, env, insiders) {
884
1786
  workspaceTemplate
885
1787
  };
886
1788
  }
1789
/**
 * Build the runtime configuration for a generic `cli` target.
 *
 * Reads `command_template` / `commandTemplate` (required), the files-format
 * aliases (`files_format`, `filesFormat`, `attachments_format`,
 * `attachmentsFormat`), `cwd`, `env`, `timeout_seconds` / `timeoutSeconds` and
 * `healthcheck` from the target's settings. The command template is resolved
 * last and then checked for unsupported `{PLACEHOLDER}` tokens.
 *
 * @param {object} target - Parsed target definition (`name`, optional `settings`).
 * @param {object} env - Environment map handed to the resolve helpers.
 * @returns {object} `{ commandTemplate, filesFormat, cwd, env, timeoutMs, healthcheck }`.
 */
function resolveCliConfig(target, env) {
  const cliSettings = target.settings ?? {};
  const rawTemplate = cliSettings.command_template ?? cliSettings.commandTemplate;
  const rawFilesFormat =
    cliSettings.files_format ??
    cliSettings.filesFormat ??
    cliSettings.attachments_format ??
    cliSettings.attachmentsFormat;

  const config = {
    commandTemplate: void 0,
    filesFormat: resolveOptionalLiteralString(rawFilesFormat),
    cwd: resolveOptionalString(cliSettings.cwd, env, `${target.name} working directory`, {
      allowLiteral: true,
      optionalEnv: true
    }),
    env: resolveEnvOverrides(cliSettings.env, env, target.name),
    timeoutMs: resolveTimeoutMs(
      cliSettings.timeout_seconds ?? cliSettings.timeoutSeconds,
      `${target.name} timeout`
    ),
    healthcheck: resolveCliHealthcheck(cliSettings.healthcheck, env, target.name)
  };

  // The template is resolved after every optional setting so that validation
  // errors surface in the same order as before.
  config.commandTemplate = resolveString(rawTemplate, env, `${target.name} CLI command template`, true);
  assertSupportedCliPlaceholders(config.commandTemplate, `${target.name} CLI command template`);
  return config;
}
1818
/**
 * Resolve the `env` overrides map of a CLI target.
 *
 * @param {unknown} source - Raw `settings.env` value; `undefined`/`null` mean "no overrides".
 * @param {object} env - Environment map used by `resolveString` to resolve each value.
 * @param {string} targetName - Target name, used as the prefix of error messages.
 * @returns {Record<string, string>|undefined} Resolved overrides, or `undefined`
 *   when none were supplied or the map is empty.
 * @throws {Error} When `source` is not a plain object map or a value is not a string.
 */
function resolveEnvOverrides(source, env, targetName) {
  if (source == null) {
    return void 0;
  }
  const isObjectMap = typeof source === "object" && !Array.isArray(source);
  if (!isObjectMap) {
    throw new Error(`${targetName} env overrides must be an object map of strings`);
  }
  const overrides = {};
  Object.entries(source).forEach(([name, rawValue]) => {
    if (typeof rawValue !== "string") {
      throw new Error(`${targetName} env override '${name}' must be a string`);
    }
    overrides[name] = resolveString(rawValue, env, `${targetName} env override '${name}'`);
  });
  return Object.keys(overrides).length === 0 ? void 0 : overrides;
}
1836
/**
 * Convert a timeout expressed in seconds into whole milliseconds.
 *
 * @param {unknown} source - Raw setting value, parsed by `resolveOptionalNumber`.
 * @param {string} description - Human-readable label used in error messages.
 * @returns {number|undefined} Timeout in milliseconds (always >= 1), or
 *   `undefined` when no timeout was configured.
 * @throws {Error} When the configured value is zero or negative.
 */
function resolveTimeoutMs(source, description) {
  const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
  if (seconds === void 0) {
    return void 0;
  }
  if (seconds <= 0) {
    throw new Error(`${description} must be greater than zero seconds`);
  }
  // A positive sub-millisecond value would floor to 0, and consumers treat a
  // falsy timeout as "no timeout at all". Clamp to 1 ms so a configured
  // timeout can never silently turn into an unbounded wait.
  return Math.max(1, Math.floor(seconds * 1e3));
}
1846
/**
 * Resolve the optional `healthcheck` section of a CLI target.
 *
 * Two shapes are supported, selected by `type`:
 *  - `"http"`: requires `url`.
 *  - `"command"`: requires `command_template` / `commandTemplate` (validated for
 *    supported placeholders) and accepts an optional `cwd`.
 * Both accept `timeout_seconds` / `timeoutSeconds`, which is resolved before the
 * type is inspected.
 *
 * @param {unknown} source - Raw `settings.healthcheck`; `undefined`/`null` disable it.
 * @param {object} env - Environment map handed to the resolve helpers.
 * @param {string} targetName - Target name, used as the prefix of error messages.
 * @returns {object|undefined} Normalised healthcheck descriptor.
 * @throws {Error} When `source` is not an object or `type` is unsupported.
 */
function resolveCliHealthcheck(source, env, targetName) {
  if (source == null) {
    return void 0;
  }
  if (Array.isArray(source) || typeof source !== "object") {
    throw new Error(`${targetName} healthcheck must be an object`);
  }
  const { type: probeType } = source;
  const timeoutMs = resolveTimeoutMs(
    source.timeout_seconds ?? source.timeoutSeconds,
    `${targetName} healthcheck timeout`
  );
  switch (probeType) {
    case "http":
      return {
        type: "http",
        url: resolveString(source.url, env, `${targetName} healthcheck URL`),
        timeoutMs
      };
    case "command": {
      const commandTemplate = resolveString(
        source.command_template ?? source.commandTemplate,
        env,
        `${targetName} healthcheck command template`,
        true
      );
      assertSupportedCliPlaceholders(commandTemplate, `${targetName} healthcheck command template`);
      const cwd = resolveOptionalString(source.cwd, env, `${targetName} healthcheck cwd`, {
        allowLiteral: true,
        optionalEnv: true
      });
      return { type: "command", commandTemplate, timeoutMs, cwd };
    }
    default:
      throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
  }
}
1888
/**
 * Reject a command template that uses a `{PLACEHOLDER}` outside `CLI_PLACEHOLDERS`.
 *
 * @param {string} template - Command template to validate.
 * @param {string} description - Label for the template, used in the error message.
 * @throws {Error} Naming the first unsupported placeholder and listing the supported ones.
 */
function assertSupportedCliPlaceholders(template, description) {
  const firstUnsupported = extractCliPlaceholders(template).find(
    (name) => !CLI_PLACEHOLDERS.has(name)
  );
  if (firstUnsupported === void 0) {
    return;
  }
  throw new Error(
    `${description} includes unsupported placeholder '{${firstUnsupported}}'. Supported placeholders: ${Array.from(CLI_PLACEHOLDERS).join(", ")}`
  );
}
1898
/**
 * List every `{NAME}` placeholder in a template, in order of appearance.
 *
 * Only names made of upper-case ASCII letters and underscores are recognised;
 * duplicates are kept.
 *
 * @param {string} template - Command template to scan.
 * @returns {string[]} Placeholder names without the surrounding braces.
 */
function extractCliPlaceholders(template) {
  const placeholderPattern = /\{([A-Z_]+)\}/g;
  return Array.from(template.matchAll(placeholderPattern), (hit) => hit[1]).filter(Boolean);
}
887
1908
  function resolveString(source, env, description, allowLiteral = false) {
888
1909
  const value = resolveOptionalString(source, env, description, {
889
1910
  allowLiteral,
@@ -914,11 +1935,14 @@ function resolveOptionalString(source, env, description, options) {
914
1935
  }
915
1936
  const allowLiteral = options?.allowLiteral ?? false;
916
1937
  const optionalEnv = options?.optionalEnv ?? false;
917
- if (!allowLiteral && isLikelyEnvReference(trimmed)) {
1938
+ const looksLikeEnv = isLikelyEnvReference(trimmed);
1939
+ if (looksLikeEnv) {
918
1940
  if (optionalEnv) {
919
1941
  return void 0;
920
1942
  }
921
- throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
1943
+ if (!allowLiteral) {
1944
+ throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
1945
+ }
922
1946
  }
923
1947
  return trimmed;
924
1948
  }
@@ -968,16 +1992,43 @@ function resolveOptionalBoolean(source) {
968
1992
  function isLikelyEnvReference(value) {
969
1993
  return /^[A-Z0-9_]+$/.test(value);
970
1994
  }
1995
/**
 * Resolve an optional list of string arguments.
 *
 * Each item is trimmed. When the trimmed text is the name of a variable that
 * `env` itself defines, the variable's value is substituted; otherwise the
 * trimmed literal is kept.
 *
 * @param {unknown} source - Raw setting; `undefined`, `null` and `[]` mean "not configured".
 * @param {object} env - Environment map consulted for substitutions.
 * @param {string} description - Label used as the prefix of error messages.
 * @returns {string[]|undefined} Resolved items, or `undefined` when not configured.
 * @throws {Error} When `source` is not an array, an item is not a non-empty
 *   string, or a referenced environment variable is blank.
 */
function resolveOptionalStringArray(source, env, description) {
  if (source === void 0 || source === null) {
    return void 0;
  }
  if (!Array.isArray(source)) {
    throw new Error(`${description} must be an array of strings`);
  }
  if (source.length === 0) {
    return void 0;
  }
  const resolved = [];
  // entries() also visits holes in sparse arrays (as undefined), so they are
  // rejected like any other non-string item.
  for (const [index, item] of source.entries()) {
    if (typeof item !== "string") {
      throw new Error(`${description}[${index}] must be a string`);
    }
    const trimmed = item.trim();
    if (trimmed.length === 0) {
      throw new Error(`${description}[${index}] cannot be empty`);
    }
    // Only own string entries count as environment variables. A plain lookup
    // would also find inherited members such as "constructor" or "toString",
    // whose values are functions and would crash on `.trim()`.
    const envValue = Object.prototype.hasOwnProperty.call(env, trimmed) ? env[trimmed] : void 0;
    if (typeof envValue !== "string") {
      resolved.push(trimmed);
      continue;
    }
    if (envValue.trim().length === 0) {
      throw new Error(`Environment variable '${trimmed}' for ${description}[${index}] is empty`);
    }
    resolved.push(envValue);
  }
  return resolved;
}
971
2027
 
972
2028
  // src/evaluation/providers/vscode.ts
973
2029
  import { readFile as readFile2 } from "node:fs/promises";
974
- import path2 from "node:path";
975
- import {
976
- dispatchAgentSession,
977
- dispatchBatchAgent,
978
- getSubagentRoot,
979
- provisionSubagents
980
- } from "subagent";
2030
+ import path5 from "node:path";
2031
+ import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
981
2032
  var VSCodeProvider = class {
982
2033
  id;
983
2034
  kind;
@@ -994,12 +2045,11 @@ var VSCodeProvider = class {
994
2045
  if (request.signal?.aborted) {
995
2046
  throw new Error("VS Code provider request was aborted before dispatch");
996
2047
  }
997
- const attachments = normalizeAttachments(request.attachments);
998
- const promptContent = buildPromptDocument(request, attachments, request.guideline_patterns);
2048
+ const inputFiles = normalizeAttachments(request.inputFiles);
2049
+ const promptContent = buildPromptDocument2(request, inputFiles, request.guideline_patterns);
999
2050
  const session = await dispatchAgentSession({
1000
2051
  userQuery: promptContent,
1001
- // Use full prompt content instead of just request.prompt
1002
- extraAttachments: attachments,
2052
+ extraAttachments: inputFiles,
1003
2053
  wait: this.config.waitForResponse,
1004
2054
  dryRun: this.config.dryRun,
1005
2055
  vscodeCmd: this.config.command,
@@ -1016,7 +2066,7 @@ var VSCodeProvider = class {
1016
2066
  text: "",
1017
2067
  raw: {
1018
2068
  session,
1019
- attachments
2069
+ inputFiles
1020
2070
  }
1021
2071
  };
1022
2072
  }
@@ -1025,7 +2075,7 @@ var VSCodeProvider = class {
1025
2075
  text: responseText,
1026
2076
  raw: {
1027
2077
  session,
1028
- attachments
2078
+ inputFiles
1029
2079
  }
1030
2080
  };
1031
2081
  }
@@ -1035,17 +2085,17 @@ var VSCodeProvider = class {
1035
2085
  }
1036
2086
  const normalizedRequests = requests.map((req) => ({
1037
2087
  request: req,
1038
- attachments: normalizeAttachments(req.attachments)
2088
+ inputFiles: normalizeAttachments(req.inputFiles)
1039
2089
  }));
1040
- const combinedAttachments = mergeAttachments(
1041
- normalizedRequests.map(({ attachments }) => attachments)
2090
+ const combinedInputFiles = mergeAttachments(
2091
+ normalizedRequests.map(({ inputFiles }) => inputFiles)
1042
2092
  );
1043
2093
  const userQueries = normalizedRequests.map(
1044
- ({ request, attachments }) => buildPromptDocument(request, attachments, request.guideline_patterns)
2094
+ ({ request, inputFiles }) => buildPromptDocument2(request, inputFiles, request.guideline_patterns)
1045
2095
  );
1046
2096
  const session = await dispatchBatchAgent({
1047
2097
  userQueries,
1048
- extraAttachments: combinedAttachments,
2098
+ extraAttachments: combinedInputFiles,
1049
2099
  wait: this.config.waitForResponse,
1050
2100
  dryRun: this.config.dryRun,
1051
2101
  vscodeCmd: this.config.command,
@@ -1058,12 +2108,12 @@ var VSCodeProvider = class {
1058
2108
  throw new Error(failure);
1059
2109
  }
1060
2110
  if (this.config.dryRun) {
1061
- return normalizedRequests.map(({ attachments }) => ({
2111
+ return normalizedRequests.map(({ inputFiles }) => ({
1062
2112
  text: "",
1063
2113
  raw: {
1064
2114
  session,
1065
- attachments,
1066
- allAttachments: combinedAttachments
2115
+ inputFiles,
2116
+ allInputFiles: combinedInputFiles
1067
2117
  }
1068
2118
  }));
1069
2119
  }
@@ -1079,8 +2129,8 @@ var VSCodeProvider = class {
1079
2129
  text: responseText,
1080
2130
  raw: {
1081
2131
  session,
1082
- attachments: normalizedRequests[index]?.attachments,
1083
- allAttachments: combinedAttachments,
2132
+ inputFiles: normalizedRequests[index]?.inputFiles,
2133
+ allInputFiles: combinedInputFiles,
1084
2134
  responseFile
1085
2135
  }
1086
2136
  });
@@ -1088,27 +2138,27 @@ var VSCodeProvider = class {
1088
2138
  return responses;
1089
2139
  }
1090
2140
  };
1091
- function buildPromptDocument(request, attachments, guidelinePatterns) {
2141
+ function buildPromptDocument2(request, attachments, guidelinePatterns) {
1092
2142
  const parts = [];
1093
- const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
2143
+ const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
1094
2144
  const attachmentFiles = collectAttachmentFiles(attachments);
1095
2145
  const nonGuidelineAttachments = attachmentFiles.filter(
1096
2146
  (file) => !guidelineFiles.includes(file)
1097
2147
  );
1098
- const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineAttachments);
2148
+ const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineAttachments);
1099
2149
  if (prereadBlock.length > 0) {
1100
2150
  parts.push("\n", prereadBlock);
1101
2151
  }
1102
2152
  parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
1103
2153
  return parts.join("\n").trim();
1104
2154
  }
1105
- function buildMandatoryPrereadBlock(guidelineFiles, attachmentFiles) {
2155
+ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
1106
2156
  if (guidelineFiles.length === 0 && attachmentFiles.length === 0) {
1107
2157
  return "";
1108
2158
  }
1109
2159
  const buildList = (files) => files.map((absolutePath) => {
1110
- const fileName = path2.basename(absolutePath);
1111
- const fileUri = pathToFileUri(absolutePath);
2160
+ const fileName = path5.basename(absolutePath);
2161
+ const fileUri = pathToFileUri2(absolutePath);
1112
2162
  return `* [${fileName}](${fileUri})`;
1113
2163
  });
1114
2164
  const sections = [];
@@ -1126,14 +2176,14 @@ ${buildList(attachmentFiles).join("\n")}.`);
1126
2176
  );
1127
2177
  return sections.join("\n");
1128
2178
  }
1129
- function collectGuidelineFiles(attachments, guidelinePatterns) {
2179
+ function collectGuidelineFiles2(attachments, guidelinePatterns) {
1130
2180
  if (!attachments || attachments.length === 0) {
1131
2181
  return [];
1132
2182
  }
1133
2183
  const unique = /* @__PURE__ */ new Map();
1134
2184
  for (const attachment of attachments) {
1135
- const absolutePath = path2.resolve(attachment);
1136
- const normalized = absolutePath.split(path2.sep).join("/");
2185
+ const absolutePath = path5.resolve(attachment);
2186
+ const normalized = absolutePath.split(path5.sep).join("/");
1137
2187
  if (isGuidelineFile(normalized, guidelinePatterns)) {
1138
2188
  if (!unique.has(absolutePath)) {
1139
2189
  unique.set(absolutePath, absolutePath);
@@ -1148,15 +2198,15 @@ function collectAttachmentFiles(attachments) {
1148
2198
  }
1149
2199
  const unique = /* @__PURE__ */ new Map();
1150
2200
  for (const attachment of attachments) {
1151
- const absolutePath = path2.resolve(attachment);
2201
+ const absolutePath = path5.resolve(attachment);
1152
2202
  if (!unique.has(absolutePath)) {
1153
2203
  unique.set(absolutePath, absolutePath);
1154
2204
  }
1155
2205
  }
1156
2206
  return Array.from(unique.values());
1157
2207
  }
1158
- function pathToFileUri(filePath) {
1159
- const absolutePath = path2.isAbsolute(filePath) ? filePath : path2.resolve(filePath);
2208
+ function pathToFileUri2(filePath) {
2209
+ const absolutePath = path5.isAbsolute(filePath) ? filePath : path5.resolve(filePath);
1160
2210
  const normalizedPath = absolutePath.replace(/\\/g, "/");
1161
2211
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
1162
2212
  return `file:///${normalizedPath}`;
@@ -1169,7 +2219,7 @@ function normalizeAttachments(attachments) {
1169
2219
  }
1170
2220
  const deduped = /* @__PURE__ */ new Set();
1171
2221
  for (const attachment of attachments) {
1172
- deduped.add(path2.resolve(attachment));
2222
+ deduped.add(path5.resolve(attachment));
1173
2223
  }
1174
2224
  return Array.from(deduped);
1175
2225
  }
@@ -1177,8 +2227,8 @@ function mergeAttachments(all) {
1177
2227
  const deduped = /* @__PURE__ */ new Set();
1178
2228
  for (const list of all) {
1179
2229
  if (!list) continue;
1180
- for (const attachment of list) {
1181
- deduped.add(path2.resolve(attachment));
2230
+ for (const inputFile of list) {
2231
+ deduped.add(path5.resolve(inputFile));
1182
2232
  }
1183
2233
  }
1184
2234
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -1223,9 +2273,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
1223
2273
  }
1224
2274
 
1225
2275
  // src/evaluation/providers/targets-file.ts
1226
- import { constants as constants2 } from "node:fs";
1227
- import { access as access2, readFile as readFile3 } from "node:fs/promises";
1228
- import path3 from "node:path";
2276
+ import { constants as constants3 } from "node:fs";
2277
+ import { access as access3, readFile as readFile3 } from "node:fs/promises";
2278
+ import path6 from "node:path";
1229
2279
  import { parse as parse2 } from "yaml";
1230
2280
  function isRecord(value) {
1231
2281
  return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -1281,14 +2331,14 @@ function assertTargetDefinition(value, index, filePath) {
1281
2331
  }
1282
2332
  async function fileExists3(filePath) {
1283
2333
  try {
1284
- await access2(filePath, constants2.F_OK);
2334
+ await access3(filePath, constants3.F_OK);
1285
2335
  return true;
1286
2336
  } catch {
1287
2337
  return false;
1288
2338
  }
1289
2339
  }
1290
2340
  async function readTargetDefinitions(filePath) {
1291
- const absolutePath = path3.resolve(filePath);
2341
+ const absolutePath = path6.resolve(filePath);
1292
2342
  if (!await fileExists3(absolutePath)) {
1293
2343
  throw new Error(`targets.yaml not found at ${absolutePath}`);
1294
2344
  }
@@ -1315,6 +2365,10 @@ function createProvider(target) {
1315
2365
  return new AnthropicProvider(target.name, target.config);
1316
2366
  case "gemini":
1317
2367
  return new GeminiProvider(target.name, target.config);
2368
+ case "cli":
2369
+ return new CliProvider(target.name, target.config);
2370
+ case "codex":
2371
+ return new CodexProvider(target.name, target.config);
1318
2372
  case "mock":
1319
2373
  return new MockProvider(target.name, target.config);
1320
2374
  case "vscode":
@@ -1331,230 +2385,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
1331
2385
  return createProvider(resolved);
1332
2386
  }
1333
2387
 
1334
- // src/evaluation/scoring.ts
1335
- var KEY_TERM_MATCH_THRESHOLD = 0.5;
1336
- var ACTION_WORDS = /* @__PURE__ */ new Set([
1337
- "use",
1338
- "avoid",
1339
- "prefer",
1340
- "replace",
1341
- "consider",
1342
- "ensure",
1343
- "remove",
1344
- "add"
1345
- ]);
1346
- var STOP_WORDS = /* @__PURE__ */ new Set([
1347
- "the",
1348
- "a",
1349
- "an",
1350
- "and",
1351
- "or",
1352
- "but",
1353
- "in",
1354
- "on",
1355
- "at",
1356
- "to",
1357
- "for",
1358
- "of",
1359
- "with",
1360
- "by",
1361
- "is",
1362
- "are",
1363
- "was",
1364
- "were",
1365
- "be",
1366
- "been",
1367
- "being",
1368
- "have",
1369
- "has",
1370
- "had",
1371
- "do",
1372
- "does",
1373
- "did",
1374
- "will",
1375
- "would",
1376
- "could",
1377
- "should"
1378
- ]);
1379
- var ERROR_PREFIXES = [
1380
- "error:",
1381
- "err:",
1382
- "vs code command failed",
1383
- "exception",
1384
- "traceback",
1385
- "no response file was generated",
1386
- "timed out",
1387
- "cli not found"
1388
- ];
1389
- function extractAspects(expectedResponse) {
1390
- const lines = expectedResponse.split(/\r?\n/).map((line) => line.trim());
1391
- const aspects = [];
1392
- for (const line of lines) {
1393
- if (line.length === 0) {
1394
- continue;
1395
- }
1396
- const bulletMatch = /^([-*•]|[0-9]+\.)\s*(.+)$/.exec(line);
1397
- if (bulletMatch) {
1398
- const normalized = normalizeAspect(bulletMatch[2]);
1399
- if (normalized.length > 0) {
1400
- aspects.push(normalized);
1401
- }
1402
- continue;
1403
- }
1404
- const lowered = line.toLowerCase();
1405
- if (Array.from(ACTION_WORDS).some((word) => lowered.startsWith(word))) {
1406
- const normalized = normalizeAspect(line);
1407
- if (normalized.length > 0) {
1408
- aspects.push(normalized);
1409
- }
1410
- }
1411
- }
1412
- return aspects;
1413
- }
1414
- function calculateHits(candidateResponse, expectedAspects) {
1415
- const { normalizedText, words } = normalizeCandidate(candidateResponse);
1416
- const hits = [];
1417
- for (const aspect of expectedAspects) {
1418
- if (matchesAspect(aspect, normalizedText, words)) {
1419
- hits.push(aspect);
1420
- }
1421
- }
1422
- return hits;
1423
- }
1424
- function calculateMisses(candidateResponse, expectedAspects, resolvedHits) {
1425
- const hits = new Set(resolvedHits ?? calculateHits(candidateResponse, expectedAspects));
1426
- return expectedAspects.filter((aspect) => !hits.has(aspect));
1427
- }
1428
- function scoreCandidateResponse(candidateResponse, expectedAspects) {
1429
- if (expectedAspects.length === 0) {
1430
- if (isErrorLike(candidateResponse)) {
1431
- return {
1432
- score: 0,
1433
- hits: [],
1434
- misses: ["Model produced an error instead of an answer."],
1435
- hitCount: 0,
1436
- totalAspects: 0,
1437
- rawAspects: []
1438
- };
1439
- }
1440
- return {
1441
- score: 1,
1442
- hits: [],
1443
- misses: [],
1444
- hitCount: 0,
1445
- totalAspects: 0,
1446
- rawAspects: []
1447
- };
1448
- }
1449
- const hits = calculateHits(candidateResponse, expectedAspects);
1450
- const misses = expectedAspects.filter((aspect) => !hits.includes(aspect));
1451
- const score = expectedAspects.length > 0 ? hits.length / expectedAspects.length : 0;
1452
- return {
1453
- score,
1454
- hits,
1455
- misses,
1456
- hitCount: hits.length,
1457
- totalAspects: expectedAspects.length,
1458
- rawAspects: expectedAspects
1459
- };
1460
- }
1461
- function isErrorLike(text) {
1462
- if (!text) {
1463
- return false;
1464
- }
1465
- const lowered = text.trim().toLowerCase();
1466
- return ERROR_PREFIXES.some((prefix) => lowered.startsWith(prefix));
1467
- }
1468
- function normalizeAspect(aspect) {
1469
- const sanitized = aspect.toLowerCase().replace(/[^\w\s]/g, " ").replace(/\s+/g, " ").trim();
1470
- return sanitized;
1471
- }
1472
- function normalizeCandidate(candidate) {
1473
- const lowered = candidate.toLowerCase();
1474
- const normalizedText = lowered.replace(/[^\w\s]/g, " ");
1475
- const words = new Set(normalizedText.split(/\s+/).filter((word) => word.length > 0));
1476
- return { normalizedText, words };
1477
- }
1478
- function matchesAspect(aspect, candidateNormalized, candidateWords) {
1479
- const keyTerms = extractKeyTerms(aspect);
1480
- if (keyTerms.length === 0) {
1481
- return false;
1482
- }
1483
- const matches = keyTerms.filter((term) => candidateWords.has(term)).length;
1484
- const ratio = matches / keyTerms.length;
1485
- if (ratio >= KEY_TERM_MATCH_THRESHOLD) {
1486
- return true;
1487
- }
1488
- const aspectWords = aspect.split(" ");
1489
- if (aspectWords.length >= 2) {
1490
- for (let index = 0; index < aspectWords.length - 1; index += 1) {
1491
- const phrase = `${aspectWords[index]} ${aspectWords[index + 1]}`;
1492
- if (candidateNormalized.includes(phrase)) {
1493
- return true;
1494
- }
1495
- }
1496
- }
1497
- return false;
1498
- }
1499
- function extractKeyTerms(aspect, maxTerms = 5) {
1500
- const terms = [];
1501
- const words = aspect.split(" ");
1502
- for (const word of words) {
1503
- if (word.length <= 2) {
1504
- continue;
1505
- }
1506
- if (STOP_WORDS.has(word)) {
1507
- continue;
1508
- }
1509
- terms.push(word);
1510
- if (terms.length >= maxTerms) {
1511
- break;
1512
- }
1513
- }
1514
- return terms;
1515
- }
1516
-
1517
- // src/evaluation/grading.ts
2388
+ // src/evaluation/evaluators.ts
1518
2389
  import { randomUUID } from "node:crypto";
1519
- var HeuristicGrader = class {
1520
- kind = "heuristic";
1521
- grade(context) {
1522
- const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
1523
- const result = scoreCandidateResponse(context.candidate, expectedAspects);
1524
- const misses = [...result.misses];
1525
- if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
1526
- const firstLine = context.candidate.split(/\r?\n/)[0]?.trim();
1527
- if (firstLine && !misses.includes(firstLine)) {
1528
- misses.unshift(firstLine);
1529
- }
1530
- }
1531
- return {
1532
- score: result.score,
1533
- hits: result.hits,
1534
- misses,
1535
- expectedAspectCount: result.totalAspects,
1536
- rawAspects: result.rawAspects
1537
- };
1538
- }
1539
- };
1540
- var QualityGrader = class {
2390
+ var LlmJudgeEvaluator = class {
1541
2391
  kind = "llm_judge";
1542
2392
  resolveJudgeProvider;
1543
2393
  maxOutputTokens;
1544
2394
  temperature;
2395
+ customPrompt;
1545
2396
  constructor(options) {
1546
2397
  this.resolveJudgeProvider = options.resolveJudgeProvider;
1547
2398
  this.maxOutputTokens = options.maxOutputTokens;
1548
2399
  this.temperature = options.temperature;
2400
+ this.customPrompt = options.customPrompt;
1549
2401
  }
1550
- async grade(context) {
2402
+ async evaluate(context) {
1551
2403
  const judgeProvider = await this.resolveJudgeProvider(context);
1552
2404
  if (!judgeProvider) {
1553
2405
  throw new Error("No judge provider available for LLM grading");
1554
2406
  }
1555
2407
  const prompt = buildQualityPrompt(context.evalCase, context.candidate);
2408
+ const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
1556
2409
  const metadata = {
1557
- systemPrompt: QUALITY_SYSTEM_PROMPT
2410
+ ...systemPrompt !== void 0 ? { systemPrompt } : {},
2411
+ ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
1558
2412
  };
1559
2413
  const response = await judgeProvider.invoke({
1560
2414
  prompt,
@@ -1569,12 +2423,13 @@ var QualityGrader = class {
1569
2423
  const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
1570
2424
  const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
1571
2425
  const reasoning = parsed.reasoning ?? response.reasoning;
1572
- const graderRawRequest = {
2426
+ const evaluatorRawRequest = {
1573
2427
  id: randomUUID(),
1574
2428
  provider: judgeProvider.id,
1575
2429
  prompt,
1576
- systemPrompt: QUALITY_SYSTEM_PROMPT,
1577
- target: context.target.name
2430
+ target: context.target.name,
2431
+ ...systemPrompt !== void 0 ? { systemPrompt } : {},
2432
+ ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
1578
2433
  };
1579
2434
  return {
1580
2435
  score,
@@ -1582,7 +2437,7 @@ var QualityGrader = class {
1582
2437
  misses,
1583
2438
  expectedAspectCount: hits.length + misses.length || 1,
1584
2439
  reasoning,
1585
- graderRawRequest
2440
+ evaluatorRawRequest
1586
2441
  };
1587
2442
  }
1588
2443
  };
@@ -1700,11 +2555,117 @@ function extractJsonBlob(text) {
1700
2555
  function isNonEmptyString(value) {
1701
2556
  return typeof value === "string" && value.trim().length > 0;
1702
2557
  }
2558
/**
 * Evaluator that delegates scoring to an external script.
 *
 * The script receives a pretty-printed JSON document on stdin (task, outcome,
 * expected answer, candidate output, system message, guideline paths,
 * attachments, user segments) and is expected to print JSON with `score`,
 * `hits`, `misses` and `reasoning`. Missing or malformed fields fall back to
 * `0`, `[]`, `[]` and `undefined`. If running the script throws, the failure
 * is reported as a zero score instead of propagating.
 */
var CodeEvaluator = class {
  kind = "code";
  script;
  cwd;
  agentTimeoutMs;
  /**
   * @param {{script: string, cwd?: string, agentTimeoutMs?: number}} options
   */
  constructor(options) {
    ({ script: this.script, cwd: this.cwd, agentTimeoutMs: this.agentTimeoutMs } = options);
  }
  /**
   * Run the script against one candidate answer.
   *
   * @param {object} context - Evaluation context (`evalCase`, `candidate`, `promptInputs`).
   * @returns {Promise<object>} Score record including `evaluatorRawRequest`.
   */
  async evaluate(context) {
    const { evalCase, candidate, promptInputs } = context;
    // Built outside the try block on purpose: a malformed context should
    // surface as an error, not as a zero score.
    const stdinPayload = JSON.stringify(
      {
        task: evalCase.task,
        outcome: evalCase.outcome,
        expected: evalCase.expected_assistant_raw,
        output: candidate,
        system_message: promptInputs.systemMessage ?? "",
        guideline_paths: evalCase.guideline_paths,
        attachments: evalCase.file_paths,
        user_segments: evalCase.user_segments
      },
      null,
      2
    );
    const describeRequest = (extra = {}) => ({
      script: this.script,
      ...this.cwd ? { cwd: this.cwd } : {},
      ...extra
    });
    const nonEmptyStrings = (list) => Array.isArray(list) ? list.filter(isNonEmptyString) : [];
    try {
      const scriptOutput = await executeScript(this.script, stdinPayload, this.agentTimeoutMs, this.cwd);
      const report = parseJsonSafe(scriptOutput);
      const rawScore = typeof report?.score === "number" ? report.score : 0;
      const hits = nonEmptyStrings(report?.hits);
      const misses = nonEmptyStrings(report?.misses);
      return {
        score: clampScore(rawScore),
        hits,
        misses,
        expectedAspectCount: hits.length + misses.length || 1,
        reasoning: typeof report?.reasoning === "string" ? report.reasoning : void 0,
        evaluatorRawRequest: describeRequest()
      };
    } catch (error) {
      const failure = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        hits: [],
        misses: [`Code evaluator failed: ${failure}`],
        expectedAspectCount: 1,
        reasoning: failure,
        evaluatorRawRequest: describeRequest({ error: failure })
      };
    }
  }
};
2618
/**
 * Run an evaluator script through the shell, feed it `input` on stdin and
 * return its trimmed stdout.
 *
 * @param {string} scriptPath - Command line to execute (interpreted by the shell).
 * @param {string} input - Text written to the script's stdin.
 * @param {number} [agentTimeoutMs] - Kill the script and reject after this many
 *   milliseconds; a falsy value disables the timeout.
 * @param {string} [cwd] - Working directory for the script.
 * @returns {Promise<string>} Trimmed stdout of the script.
 * @throws {Error} When the script cannot be spawned, times out, or exits with a
 *   non-zero code while having written to stderr.
 */
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
  const { spawn: spawn2 } = await import("node:child_process");
  return await new Promise((resolve, reject) => {
    const child = spawn2(scriptPath, {
      shell: true,
      cwd
    });
    let stdout = "";
    let stderr = "";
    let settled = false;
    let timeout;
    // Single exit point: first outcome wins and the timer is always released.
    const settle = (finish, value) => {
      if (settled) {
        return;
      }
      settled = true;
      if (timeout !== void 0) {
        clearTimeout(timeout);
      }
      finish(value);
    };
    if (agentTimeoutMs) {
      timeout = setTimeout(() => {
        child.kill();
        settle(reject, new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
      }, agentTimeoutMs);
    }
    // Decode at the stream level so multi-byte characters that straddle two
    // chunks are not corrupted.
    child.stdout?.setEncoding("utf8");
    child.stdout?.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr?.setEncoding("utf8");
    child.stderr?.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", (error) => {
      settle(reject, error);
    });
    // "close" fires only after the stdio streams have ended; "exit" can fire
    // while output is still buffered, which would truncate the result.
    child.on("close", (code) => {
      if (code && code !== 0 && stderr.length > 0) {
        settle(reject, new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
        return;
      }
      settle(resolve, stdout.trim());
    });
    // A script may exit without draining stdin. The resulting broken-pipe error
    // is expected and the real outcome is reported by "close"; without this
    // listener it would be an unhandled "error" event.
    child.stdin?.on("error", (error) => {
      const code = error?.code;
      if (code === "EPIPE" || code === "EOF" || code === "ERR_STREAM_DESTROYED") {
        return;
      }
      settle(reject, error);
    });
    child.stdin?.end(input);
  });
}
2657
/**
 * Parse JSON without throwing.
 *
 * @param {string} payload - Text to parse.
 * @returns {unknown} The parsed value, or `undefined` when `payload` is not valid JSON.
 */
function parseJsonSafe(payload) {
  let decoded;
  try {
    decoded = JSON.parse(payload);
  } catch {
    // Invalid JSON is an expected outcome here; callers treat undefined as "no data".
    decoded = void 0;
  }
  return decoded;
}
1703
2664
 
1704
2665
  // src/evaluation/orchestrator.ts
1705
2666
  import { createHash, randomUUID as randomUUID2 } from "node:crypto";
1706
- import { mkdir, writeFile as writeFile2 } from "node:fs/promises";
1707
- import path4 from "node:path";
2667
+ import { mkdir as mkdir2, readFile as readFile4, writeFile as writeFile2 } from "node:fs/promises";
2668
+ import path7 from "node:path";
1708
2669
 
1709
2670
  // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
1710
2671
  var Node = class {
@@ -1851,7 +2812,7 @@ async function runEvaluation(options) {
1851
2812
  targets,
1852
2813
  env,
1853
2814
  providerFactory,
1854
- graders,
2815
+ evaluators,
1855
2816
  maxRetries,
1856
2817
  agentTimeoutMs,
1857
2818
  promptDumpDir,
@@ -1910,7 +2871,7 @@ async function runEvaluation(options) {
1910
2871
  }
1911
2872
  return getOrCreateProvider(resolvedJudge);
1912
2873
  };
1913
- const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
2874
+ const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
1914
2875
  const primaryProvider = getOrCreateProvider(target);
1915
2876
  const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
1916
2877
  if (target.providerBatching && !providerSupportsBatch && verbose) {
@@ -1933,13 +2894,14 @@ async function runEvaluation(options) {
1933
2894
  evalCases: filteredEvalCases,
1934
2895
  provider: primaryProvider,
1935
2896
  target,
1936
- graderRegistry,
2897
+ evaluatorRegistry,
1937
2898
  promptDumpDir,
1938
2899
  nowFn: now ?? (() => /* @__PURE__ */ new Date()),
1939
2900
  onProgress,
1940
2901
  onResult,
1941
2902
  verbose,
1942
- resolveJudgeProvider
2903
+ resolveJudgeProvider,
2904
+ agentTimeoutMs
1943
2905
  });
1944
2906
  } catch (error) {
1945
2907
  if (verbose) {
@@ -1970,7 +2932,7 @@ async function runEvaluation(options) {
1970
2932
  evalCase,
1971
2933
  provider: primaryProvider,
1972
2934
  target,
1973
- graders: graderRegistry,
2935
+ evaluators: evaluatorRegistry,
1974
2936
  maxRetries,
1975
2937
  agentTimeoutMs,
1976
2938
  promptDumpDir,
@@ -2036,12 +2998,13 @@ async function runBatchEvaluation(options) {
2036
2998
  evalCases,
2037
2999
  provider,
2038
3000
  target,
2039
- graderRegistry,
3001
+ evaluatorRegistry,
2040
3002
  promptDumpDir,
2041
3003
  nowFn,
2042
3004
  onProgress,
2043
3005
  onResult,
2044
- resolveJudgeProvider
3006
+ resolveJudgeProvider,
3007
+ agentTimeoutMs
2045
3008
  } = options;
2046
3009
  const promptInputsList = [];
2047
3010
  for (const evalCase of evalCases) {
@@ -2057,7 +3020,7 @@ async function runBatchEvaluation(options) {
2057
3020
  prompt: promptInputs.request,
2058
3021
  guidelines: promptInputs.guidelines,
2059
3022
  guideline_patterns: evalCase.guideline_patterns,
2060
- attachments: evalCase.file_paths,
3023
+ inputFiles: evalCase.file_paths,
2061
3024
  evalCaseId: evalCase.id,
2062
3025
  metadata: {
2063
3026
  systemPrompt: promptInputs.systemMessage ?? ""
@@ -2089,23 +3052,19 @@ async function runBatchEvaluation(options) {
2089
3052
  const evalCase = evalCases[i];
2090
3053
  const promptInputs = promptInputsList[i];
2091
3054
  const providerResponse = batchResponse[i];
2092
- const now = nowFn();
2093
- const graderKind = evalCase.grader ?? "heuristic";
2094
- const activeGrader = graderRegistry[graderKind] ?? graderRegistry.heuristic;
2095
- if (!activeGrader) {
2096
- throw new Error(`No grader registered for kind '${graderKind}'`);
2097
- }
2098
- let grade;
3055
+ let result;
2099
3056
  try {
2100
- grade = await activeGrader.grade({
3057
+ result = await evaluateCandidate({
2101
3058
  evalCase,
2102
3059
  candidate: providerResponse.text ?? "",
2103
3060
  target,
2104
3061
  provider,
2105
- attempt: 0,
3062
+ evaluators: evaluatorRegistry,
2106
3063
  promptInputs,
2107
- now,
2108
- judgeProvider: await resolveJudgeProvider(target)
3064
+ nowFn,
3065
+ attempt: 0,
3066
+ judgeProvider: await resolveJudgeProvider(target),
3067
+ agentTimeoutMs
2109
3068
  });
2110
3069
  } catch (error) {
2111
3070
  const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
@@ -2124,28 +3083,6 @@ async function runBatchEvaluation(options) {
2124
3083
  }
2125
3084
  continue;
2126
3085
  }
2127
- const completedAt = nowFn();
2128
- const rawRequest = {
2129
- request: promptInputs.request,
2130
- guidelines: promptInputs.guidelines,
2131
- guideline_paths: evalCase.guideline_paths,
2132
- system_message: promptInputs.systemMessage ?? ""
2133
- };
2134
- const result = {
2135
- eval_id: evalCase.id,
2136
- conversation_id: evalCase.conversation_id,
2137
- score: grade.score,
2138
- hits: grade.hits,
2139
- misses: grade.misses,
2140
- model_answer: providerResponse.text ?? "",
2141
- expected_aspect_count: grade.expectedAspectCount,
2142
- target: target.name,
2143
- timestamp: completedAt.toISOString(),
2144
- reasoning: grade.reasoning,
2145
- raw_aspects: grade.rawAspects,
2146
- raw_request: rawRequest,
2147
- grader_raw_request: grade.graderRawRequest
2148
- };
2149
3086
  results.push(result);
2150
3087
  if (onResult) {
2151
3088
  await onResult(result);
@@ -2167,7 +3104,7 @@ async function runEvalCase(options) {
2167
3104
  evalCase,
2168
3105
  provider,
2169
3106
  target,
2170
- graders,
3107
+ evaluators,
2171
3108
  now,
2172
3109
  maxRetries,
2173
3110
  agentTimeoutMs,
@@ -2222,27 +3159,49 @@ async function runEvalCase(options) {
2222
3159
  if (cacheKey && cache && !cachedResponse) {
2223
3160
  await cache.set(cacheKey, providerResponse);
2224
3161
  }
2225
- const graderKind = evalCase.grader ?? "heuristic";
2226
- const activeGrader = graders[graderKind] ?? graders.heuristic;
2227
- if (!activeGrader) {
2228
- throw new Error(`No grader registered for kind '${graderKind}'`);
2229
- }
2230
- let grade;
2231
3162
  try {
2232
- const gradeTimestamp = nowFn();
2233
- grade = await activeGrader.grade({
3163
+ return await evaluateCandidate({
2234
3164
  evalCase,
2235
3165
  candidate: providerResponse.text ?? "",
2236
3166
  target,
2237
3167
  provider,
2238
- attempt,
3168
+ evaluators,
2239
3169
  promptInputs,
2240
- now: gradeTimestamp,
2241
- judgeProvider
3170
+ nowFn,
3171
+ attempt,
3172
+ judgeProvider,
3173
+ agentTimeoutMs
2242
3174
  });
2243
3175
  } catch (error) {
2244
3176
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
2245
3177
  }
3178
+ }
3179
+ async function evaluateCandidate(options) {
3180
+ const {
3181
+ evalCase,
3182
+ candidate,
3183
+ target,
3184
+ provider,
3185
+ evaluators,
3186
+ promptInputs,
3187
+ nowFn,
3188
+ attempt,
3189
+ judgeProvider,
3190
+ agentTimeoutMs
3191
+ } = options;
3192
+ const gradeTimestamp = nowFn();
3193
+ const { score, evaluatorResults } = await runEvaluatorsForCase({
3194
+ evalCase,
3195
+ candidate,
3196
+ target,
3197
+ provider,
3198
+ evaluators,
3199
+ attempt,
3200
+ promptInputs,
3201
+ now: gradeTimestamp,
3202
+ judgeProvider,
3203
+ agentTimeoutMs
3204
+ });
2246
3205
  const completedAt = nowFn();
2247
3206
  const rawRequest = {
2248
3207
  request: promptInputs.request,
@@ -2253,28 +3212,200 @@ async function runEvalCase(options) {
2253
3212
  return {
2254
3213
  eval_id: evalCase.id,
2255
3214
  conversation_id: evalCase.conversation_id,
2256
- score: grade.score,
2257
- hits: grade.hits,
2258
- misses: grade.misses,
2259
- model_answer: providerResponse.text ?? "",
2260
- expected_aspect_count: grade.expectedAspectCount,
3215
+ score: score.score,
3216
+ hits: score.hits,
3217
+ misses: score.misses,
3218
+ model_answer: candidate,
3219
+ expected_aspect_count: score.expectedAspectCount,
2261
3220
  target: target.name,
2262
3221
  timestamp: completedAt.toISOString(),
2263
- reasoning: grade.reasoning,
2264
- raw_aspects: grade.rawAspects,
3222
+ reasoning: score.reasoning,
3223
+ raw_aspects: score.rawAspects,
2265
3224
  raw_request: rawRequest,
2266
- grader_raw_request: grade.graderRawRequest
3225
+ evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
3226
+ evaluator_results: evaluatorResults
2267
3227
  };
2268
3228
  }
3229
/**
 * Scores a candidate answer for one eval case.
 *
 * When the eval case carries a non-empty `evaluators` array, that list is
 * handed to `runEvaluatorList`. Otherwise a single evaluator is looked up in
 * the registry by `evalCase.evaluator` (defaulting to "llm_judge", and falling
 * back to the registry's `llm_judge` entry when the kind is not registered).
 *
 * @param {object} options - Evaluation context; the fields read here are
 *   `evalCase`, `candidate`, `target`, `provider`, `evaluators` (the registry),
 *   `attempt`, `promptInputs`, `now`, `judgeProvider` and `agentTimeoutMs`.
 * @returns {Promise<object>} `{ score }` for the single-evaluator path, or
 *   whatever `runEvaluatorList` returns for the list path.
 * @throws {Error} When no evaluator can be found in the registry.
 */
async function runEvaluatorsForCase(options) {
  const registry = options.evaluators;
  const caseEvaluators = options.evalCase.evaluators;
  const hasEvaluatorList = Boolean(caseEvaluators) && caseEvaluators.length > 0;
  if (hasEvaluatorList) {
    return runEvaluatorList({
      evalCase: options.evalCase,
      evaluators: caseEvaluators,
      candidate: options.candidate,
      target: options.target,
      provider: options.provider,
      evaluatorRegistry: registry,
      attempt: options.attempt,
      promptInputs: options.promptInputs,
      now: options.now,
      judgeProvider: options.judgeProvider,
      agentTimeoutMs: options.agentTimeoutMs
    });
  }
  const kind = options.evalCase.evaluator ?? "llm_judge";
  const selected = registry[kind] ?? registry.llm_judge;
  if (!selected) {
    throw new Error(`No evaluator registered for kind '${kind}'`);
  }
  // The single-evaluator path does not forward agentTimeoutMs.
  const score = await selected.evaluate({
    evalCase: options.evalCase,
    candidate: options.candidate,
    target: options.target,
    provider: options.provider,
    attempt: options.attempt,
    promptInputs: options.promptInputs,
    now: options.now,
    judgeProvider: options.judgeProvider
  });
  return { score };
}
3263
/**
 * Runs every evaluator configured on an eval case and aggregates the scores.
 *
 * Evaluators run sequentially. "llm_judge" entries go through
 * `runLlmJudgeEvaluator`; "code" entries are run with a fresh `CodeEvaluator`.
 * Any failure (including an unsupported evaluator type or an evaluator that
 * returns no score object) is recorded as a zero score for that evaluator
 * instead of aborting the whole case.
 *
 * @param {object} options - Evaluation context; the fields read here are
 *   `evalCase`, `evaluators` (the configured list), `candidate`, `target`,
 *   `provider`, `evaluatorRegistry`, `attempt`, `promptInputs`, `now`,
 *   `judgeProvider` and `agentTimeoutMs`.
 * @returns {Promise<{score: object, evaluatorResults: object[]}>} The
 *   aggregate score (mean of the individual scores, merged hits/misses,
 *   summed expected aspect counts, joined reasoning) plus one result record
 *   per evaluator.
 */
async function runEvaluatorList(options) {
  const {
    evalCase,
    evaluators,
    candidate,
    target,
    provider,
    evaluatorRegistry,
    attempt,
    promptInputs,
    now,
    judgeProvider,
    agentTimeoutMs
  } = options;
  const scored = [];
  const evaluatorResults = [];
  for (const evaluator of evaluators ?? []) {
    try {
      let evaluatorScore;
      if (evaluator.type === "llm_judge") {
        evaluatorScore = await runLlmJudgeEvaluator({
          config: evaluator,
          evalCase,
          candidate,
          target,
          provider,
          evaluatorRegistry,
          attempt,
          promptInputs,
          now,
          judgeProvider
        });
      } else if (evaluator.type === "code") {
        const codeEvaluator = new CodeEvaluator({
          script: evaluator.script,
          cwd: evaluator.resolvedCwd ?? evaluator.cwd,
          agentTimeoutMs
        });
        evaluatorScore = await codeEvaluator.evaluate({
          evalCase,
          candidate,
          target,
          provider,
          attempt,
          promptInputs,
          now
        });
      } else {
        // Report unknown types as a failed evaluator rather than dropping
        // them silently from the results.
        throw new Error(`Unsupported evaluator type '${String(evaluator.type)}'`);
      }
      // Build the result record before pushing anything, so a malformed score
      // cannot leave a half-recorded entry behind when the catch block runs.
      const resultEntry = {
        name: evaluator.name,
        type: evaluator.type,
        score: evaluatorScore.score,
        hits: evaluatorScore.hits,
        misses: evaluatorScore.misses,
        reasoning: evaluatorScore.reasoning,
        evaluator_raw_request: evaluatorScore.evaluatorRawRequest
      };
      scored.push({ score: evaluatorScore, name: evaluator.name, type: evaluator.type });
      evaluatorResults.push(resultEntry);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      const fallbackName = evaluator.name ?? "unknown";
      const fallbackType = evaluator.type ?? "unknown";
      const failureNote = `Evaluator '${fallbackName}' failed: ${message}`;
      const fallbackScore = {
        score: 0,
        hits: [],
        misses: [failureNote],
        expectedAspectCount: 1,
        reasoning: message
      };
      scored.push({ score: fallbackScore, name: fallbackName, type: fallbackType });
      evaluatorResults.push({
        name: fallbackName,
        type: fallbackType,
        score: 0,
        hits: [],
        misses: [failureNote],
        reasoning: message
      });
    }
  }
  const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
  const hits = scored.flatMap((entry) => entry.score.hits);
  const misses = scored.flatMap((entry) => entry.score.misses);
  const expectedAspectCount = scored.reduce((total, entry) => total + (entry.score.expectedAspectCount ?? 0), 0);
  const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
  const score = {
    score: aggregateScore,
    hits,
    misses,
    expectedAspectCount,
    reasoning,
    rawAspects: rawAspects.length > 0 ? rawAspects : void 0
  };
  return { score, evaluatorResults };
}
3370
/**
 * Runs one configured "llm_judge" evaluator entry.
 *
 * Resolves the entry's custom prompt through `resolveCustomPrompt` and then
 * delegates to the registry's `llm_judge` evaluator, passing the prompt as
 * `systemPrompt`, the entry itself as `evaluator` and its `model` as
 * `judgeModel`.
 *
 * @param {object} options - Fields read: `config`, `evalCase`, `candidate`,
 *   `target`, `provider`, `evaluatorRegistry`, `attempt`, `promptInputs`,
 *   `now` and `judgeProvider`.
 * @returns {Promise<*>} Whatever `evaluatorRegistry.llm_judge.evaluate` yields.
 */
async function runLlmJudgeEvaluator(options) {
  const judgeConfig = options.config;
  const systemPrompt = await resolveCustomPrompt(judgeConfig);
  const judgeContext = {
    evalCase: options.evalCase,
    candidate: options.candidate,
    target: options.target,
    provider: options.provider,
    attempt: options.attempt,
    promptInputs: options.promptInputs,
    now: options.now,
    judgeProvider: options.judgeProvider,
    systemPrompt,
    evaluator: judgeConfig,
    judgeModel: judgeConfig.model
  };
  return options.evaluatorRegistry.llm_judge.evaluate(judgeContext);
}
3387
/**
 * Determines the custom prompt text for an evaluator configuration.
 *
 * When `config.promptPath` is set, the file at that path is read as UTF-8 and
 * its contents returned. If reading fails, a warning is logged and the inline
 * `config.prompt` is used instead (best-effort fallback).
 *
 * @param {object} config - Evaluator configuration with optional `promptPath`
 *   and `prompt` fields.
 * @returns {Promise<*>} The file contents, or `config.prompt`.
 */
async function resolveCustomPrompt(config) {
  if (!config.promptPath) {
    return config.prompt;
  }
  try {
    return await readFile4(config.promptPath, "utf8");
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
    return config.prompt;
  }
}
3398
/**
 * Tells whether a value is a string containing at least one non-whitespace
 * character.
 *
 * @param {*} value - Value to inspect.
 * @returns {boolean} `true` only for strings that are non-empty after trimming.
 */
function isNonEmptyString2(value) {
  if (typeof value !== "string") {
    return false;
  }
  return value.trim() !== "";
}
2269
3401
  function filterEvalCases(evalCases, evalId) {
2270
3402
  if (!evalId) {
2271
3403
  return evalCases;
2272
3404
  }
2273
3405
  return evalCases.filter((evalCase) => evalCase.id === evalId);
2274
3406
  }
2275
- function buildGraderRegistry(overrides, resolveJudgeProvider) {
2276
- const heuristic = overrides?.heuristic ?? new HeuristicGrader();
2277
- const llmJudge = overrides?.llm_judge ?? new QualityGrader({
3407
+ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
3408
+ const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
2278
3409
  resolveJudgeProvider: async (context) => {
2279
3410
  if (context.judgeProvider) {
2280
3411
  return context.judgeProvider;
@@ -2284,15 +3415,14 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
2284
3415
  });
2285
3416
  return {
2286
3417
  ...overrides,
2287
- heuristic,
2288
3418
  llm_judge: llmJudge
2289
3419
  };
2290
3420
  }
2291
3421
  async function dumpPrompt(directory, evalCase, promptInputs) {
2292
3422
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
2293
3423
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
2294
- const filePath = path4.resolve(directory, filename);
2295
- await mkdir(path4.dirname(filePath), { recursive: true });
3424
+ const filePath = path7.resolve(directory, filename);
3425
+ await mkdir2(path7.dirname(filePath), { recursive: true });
2296
3426
  const payload = {
2297
3427
  eval_id: evalCase.id,
2298
3428
  request: promptInputs.request,
@@ -2309,7 +3439,7 @@ function sanitizeFilename(value) {
2309
3439
  return sanitized.length > 0 ? sanitized : randomUUID2();
2310
3440
  }
2311
3441
  async function invokeProvider(provider, options) {
2312
- const { evalCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
3442
+ const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
2313
3443
  const controller = new AbortController();
2314
3444
  const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
2315
3445
  if (signal) {
@@ -2320,7 +3450,7 @@ async function invokeProvider(provider, options) {
2320
3450
  prompt: promptInputs.request,
2321
3451
  guidelines: promptInputs.guidelines,
2322
3452
  guideline_patterns: evalCase.guideline_patterns,
2323
- attachments: evalCase.file_paths,
3453
+ inputFiles: evalCase.file_paths,
2324
3454
  evalCaseId: evalCase.id,
2325
3455
  attempt,
2326
3456
  metadata: {
@@ -2388,25 +3518,20 @@ function createAgentKernel() {
2388
3518
  return { status: "stub" };
2389
3519
  }
2390
3520
  export {
2391
- GRADER_KINDS,
2392
- HeuristicGrader,
2393
- QualityGrader,
3521
+ CodeEvaluator,
3522
+ LlmJudgeEvaluator,
2394
3523
  TEST_MESSAGE_ROLES,
2395
3524
  buildDirectoryChain,
2396
3525
  buildPromptInputs,
2397
3526
  buildSearchRoots,
2398
- calculateHits,
2399
- calculateMisses,
2400
3527
  createAgentKernel,
2401
3528
  createProvider,
2402
3529
  ensureVSCodeSubagents,
2403
- extractAspects,
2404
3530
  extractCodeBlocks,
2405
3531
  fileExists,
2406
3532
  findGitRoot,
2407
3533
  getHitCount,
2408
- isErrorLike,
2409
- isGraderKind,
3534
+ isEvaluatorKind,
2410
3535
  isGuidelineFile,
2411
3536
  isJsonObject,
2412
3537
  isJsonValue,
@@ -2419,7 +3544,6 @@ export {
2419
3544
  resolveFileReference,
2420
3545
  resolveTargetDefinition,
2421
3546
  runEvalCase,
2422
- runEvaluation,
2423
- scoreCandidateResponse
3547
+ runEvaluation
2424
3548
  };
2425
3549
  //# sourceMappingURL=index.js.map