@agentv/core 0.2.11 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -30,25 +30,20 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
30
30
  // src/index.ts
31
31
  var index_exports = {};
32
32
  __export(index_exports, {
33
- GRADER_KINDS: () => GRADER_KINDS,
34
- HeuristicGrader: () => HeuristicGrader,
35
- QualityGrader: () => QualityGrader,
33
+ CodeEvaluator: () => CodeEvaluator,
34
+ LlmJudgeEvaluator: () => LlmJudgeEvaluator,
36
35
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
37
36
  buildDirectoryChain: () => buildDirectoryChain,
38
37
  buildPromptInputs: () => buildPromptInputs,
39
38
  buildSearchRoots: () => buildSearchRoots,
40
- calculateHits: () => calculateHits,
41
- calculateMisses: () => calculateMisses,
42
39
  createAgentKernel: () => createAgentKernel,
43
40
  createProvider: () => createProvider,
44
41
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
45
- extractAspects: () => extractAspects,
46
42
  extractCodeBlocks: () => extractCodeBlocks,
47
43
  fileExists: () => fileExists,
48
44
  findGitRoot: () => findGitRoot,
49
45
  getHitCount: () => getHitCount,
50
- isErrorLike: () => isErrorLike,
51
- isGraderKind: () => isGraderKind,
46
+ isEvaluatorKind: () => isEvaluatorKind,
52
47
  isGuidelineFile: () => isGuidelineFile,
53
48
  isJsonObject: () => isJsonObject,
54
49
  isJsonValue: () => isJsonValue,
@@ -61,8 +56,7 @@ __export(index_exports, {
61
56
  resolveFileReference: () => resolveFileReference,
62
57
  resolveTargetDefinition: () => resolveTargetDefinition,
63
58
  runEvalCase: () => runEvalCase,
64
- runEvaluation: () => runEvaluation,
65
- scoreCandidateResponse: () => scoreCandidateResponse
59
+ runEvaluation: () => runEvaluation
66
60
  });
67
61
  module.exports = __toCommonJS(index_exports);
68
62
 
@@ -107,11 +101,10 @@ function isTestMessage(value) {
107
101
  }
108
102
  return candidate.content.every(isJsonObject);
109
103
  }
110
- var GRADER_KIND_VALUES = ["heuristic", "llm_judge"];
111
- var GRADER_KINDS = GRADER_KIND_VALUES;
112
- var GRADER_KIND_SET = new Set(GRADER_KIND_VALUES);
113
- function isGraderKind(value) {
114
- return typeof value === "string" && GRADER_KIND_SET.has(value);
104
+ var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
105
+ var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
106
+ function isEvaluatorKind(value) {
107
+ return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
115
108
  }
116
109
  function getHitCount(result) {
117
110
  return result.hits.length;
@@ -325,7 +318,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
325
318
  if (!Array.isArray(rawTestcases)) {
326
319
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
327
320
  }
328
- const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
321
+ const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
329
322
  const results = [];
330
323
  for (const rawEvalcase of rawTestcases) {
331
324
  if (!isJsonObject(rawEvalcase)) {
@@ -448,7 +441,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
448
441
  const assistantContent = assistantMessages[0]?.content;
449
442
  const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
450
443
  const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
451
- const testCaseGrader = coerceGrader(evalcase.grader) ?? globalGrader;
444
+ const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
445
+ const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
452
446
  const userFilePaths = [];
453
447
  for (const segment of userSegments) {
454
448
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -471,7 +465,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
471
465
  file_paths: allFilePaths,
472
466
  code_snippets: codeSnippets,
473
467
  outcome,
474
- grader: testCaseGrader
468
+ evaluator: testCaseEvaluatorKind,
469
+ evaluators
475
470
  };
476
471
  if (verbose) {
477
472
  console.log(`
@@ -632,14 +627,88 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
632
627
  }
633
628
  return parts.join(" ");
634
629
  }
635
- function coerceGrader(candidate) {
630
+ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
631
+ const execution = rawEvalCase.execution;
632
+ const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators;
633
+ if (candidateEvaluators === void 0) {
634
+ return void 0;
635
+ }
636
+ if (!Array.isArray(candidateEvaluators)) {
637
+ logWarning(`Skipping evaluators for '${evalId}': expected array`);
638
+ return void 0;
639
+ }
640
+ const evaluators = [];
641
+ for (const rawEvaluator of candidateEvaluators) {
642
+ if (!isJsonObject(rawEvaluator)) {
643
+ logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
644
+ continue;
645
+ }
646
+ const name = asString(rawEvaluator.name);
647
+ const typeValue = rawEvaluator.type;
648
+ if (!name || !isEvaluatorKind(typeValue)) {
649
+ logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
650
+ continue;
651
+ }
652
+ if (typeValue === "code") {
653
+ const script = asString(rawEvaluator.script);
654
+ if (!script) {
655
+ logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
656
+ continue;
657
+ }
658
+ const cwd = asString(rawEvaluator.cwd);
659
+ let resolvedCwd;
660
+ if (cwd) {
661
+ const resolved = await resolveFileReference(cwd, searchRoots);
662
+ if (resolved.resolvedPath) {
663
+ resolvedCwd = import_node_path2.default.resolve(resolved.resolvedPath);
664
+ } else {
665
+ logWarning(
666
+ `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
667
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
668
+ );
669
+ }
670
+ }
671
+ evaluators.push({
672
+ name,
673
+ type: "code",
674
+ script,
675
+ cwd,
676
+ resolvedCwd
677
+ });
678
+ continue;
679
+ }
680
+ const prompt = asString(rawEvaluator.prompt);
681
+ let promptPath;
682
+ if (prompt) {
683
+ const resolved = await resolveFileReference(prompt, searchRoots);
684
+ if (resolved.resolvedPath) {
685
+ promptPath = import_node_path2.default.resolve(resolved.resolvedPath);
686
+ } else {
687
+ logWarning(
688
+ `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
689
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
690
+ );
691
+ }
692
+ }
693
+ const model = asString(rawEvaluator.model);
694
+ evaluators.push({
695
+ name,
696
+ type: "llm_judge",
697
+ prompt,
698
+ promptPath,
699
+ model
700
+ });
701
+ }
702
+ return evaluators.length > 0 ? evaluators : void 0;
703
+ }
704
+ function coerceEvaluator(candidate, contextId) {
636
705
  if (typeof candidate !== "string") {
637
706
  return void 0;
638
707
  }
639
- if (isGraderKind(candidate)) {
708
+ if (isEvaluatorKind(candidate)) {
640
709
  return candidate;
641
710
  }
642
- logWarning(`Unknown grader '${candidate}', falling back to default`);
711
+ logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
643
712
  return void 0;
644
713
  }
645
714
  function logWarning(message, details) {
@@ -835,6 +904,790 @@ var GeminiProvider = class {
835
904
  }
836
905
  };
837
906
 
907
+ // src/evaluation/providers/cli.ts
908
+ var import_node_child_process = require("child_process");
909
+ var import_node_path3 = __toESM(require("path"), 1);
910
+ var import_node_util = require("util");
911
+ var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
912
+ var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
913
+ async function defaultCommandRunner(command, options) {
914
+ const execOptions = {
915
+ cwd: options.cwd,
916
+ env: options.env,
917
+ timeout: options.timeoutMs,
918
+ signal: options.signal,
919
+ maxBuffer: DEFAULT_MAX_BUFFER,
920
+ shell: process.platform === "win32" ? "powershell.exe" : void 0
921
+ };
922
+ try {
923
+ const { stdout, stderr } = await execAsync(command, execOptions);
924
+ return {
925
+ stdout,
926
+ stderr,
927
+ exitCode: 0,
928
+ failed: false,
929
+ timedOut: false,
930
+ signal: null
931
+ };
932
+ } catch (error) {
933
+ const execError = error;
934
+ return {
935
+ stdout: execError.stdout ?? "",
936
+ stderr: execError.stderr ?? "",
937
+ exitCode: typeof execError.code === "number" ? execError.code : null,
938
+ failed: true,
939
+ timedOut: execError.timedOut === true || execError.killed === true,
940
+ signal: execError.signal ?? null
941
+ };
942
+ }
943
+ }
944
+ var CliProvider = class {
945
+ id;
946
+ kind = "cli";
947
+ targetName;
948
+ supportsBatch = false;
949
+ config;
950
+ runCommand;
951
+ healthcheckPromise;
952
+ constructor(targetName, config, runner = defaultCommandRunner) {
953
+ this.targetName = targetName;
954
+ this.id = `cli:${targetName}`;
955
+ this.config = config;
956
+ this.runCommand = runner;
957
+ }
958
+ async invoke(request) {
959
+ if (request.signal?.aborted) {
960
+ throw new Error("CLI provider request was aborted before execution");
961
+ }
962
+ await this.ensureHealthy(request.signal);
963
+ const templateValues = buildTemplateValues(request, this.config);
964
+ const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
965
+ const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
966
+ const result = await this.runCommand(renderedCommand, {
967
+ cwd: this.config.cwd,
968
+ env,
969
+ timeoutMs: this.config.timeoutMs,
970
+ signal: request.signal
971
+ });
972
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
973
+ if (request.signal?.aborted) {
974
+ throw new Error("CLI provider request was aborted");
975
+ }
976
+ if (result.timedOut) {
977
+ throw new Error(
978
+ `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
979
+ );
980
+ }
981
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
982
+ const detail = result.stderr.trim() || result.stdout.trim();
983
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
984
+ throw new Error(message);
985
+ }
986
+ return {
987
+ text: result.stdout,
988
+ raw: {
989
+ command: renderedCommand,
990
+ stderr: result.stderr,
991
+ exitCode: result.exitCode ?? 0,
992
+ cwd: this.config.cwd
993
+ }
994
+ };
995
+ }
996
+ async ensureHealthy(signal) {
997
+ if (!this.config.healthcheck) {
998
+ return;
999
+ }
1000
+ if (!this.healthcheckPromise) {
1001
+ this.healthcheckPromise = this.runHealthcheck(this.config.healthcheck, signal);
1002
+ }
1003
+ return this.healthcheckPromise;
1004
+ }
1005
+ async runHealthcheck(healthcheck, signal) {
1006
+ if (!healthcheck) {
1007
+ return;
1008
+ }
1009
+ const timeoutMs = healthcheck.timeoutMs ?? this.config.timeoutMs;
1010
+ if (healthcheck.type === "http") {
1011
+ const controller = new AbortController();
1012
+ const timer = timeoutMs ? setTimeout(() => controller.abort(), timeoutMs) : void 0;
1013
+ signal?.addEventListener("abort", () => controller.abort(), { once: true });
1014
+ try {
1015
+ const response = await fetch(healthcheck.url, { method: "GET", signal: controller.signal });
1016
+ if (!response.ok) {
1017
+ throw new Error(`HTTP ${response.status} ${response.statusText}`);
1018
+ }
1019
+ } catch (error) {
1020
+ const reason = error instanceof Error ? error.message : String(error);
1021
+ throw new Error(`CLI healthcheck failed for '${this.targetName}': ${reason}`);
1022
+ } finally {
1023
+ if (timer !== void 0) {
1024
+ clearTimeout(timer);
1025
+ }
1026
+ }
1027
+ return;
1028
+ }
1029
+ const renderedCommand = renderTemplate(
1030
+ healthcheck.commandTemplate,
1031
+ buildTemplateValues(
1032
+ {
1033
+ prompt: "",
1034
+ guidelines: "",
1035
+ inputFiles: [],
1036
+ evalCaseId: "",
1037
+ attempt: 0
1038
+ },
1039
+ this.config
1040
+ )
1041
+ );
1042
+ const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
1043
+ const result = await this.runCommand(renderedCommand, {
1044
+ cwd: healthcheck.cwd ?? this.config.cwd,
1045
+ env,
1046
+ timeoutMs,
1047
+ signal
1048
+ });
1049
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
1050
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
1051
+ const detail = result.stderr.trim() || result.stdout.trim();
1052
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI healthcheck command exited with code ${codeText}`;
1053
+ throw new Error(`CLI healthcheck failed for '${this.targetName}': ${message}`);
1054
+ }
1055
+ }
1056
+ };
1057
+ function buildTemplateValues(request, config) {
1058
+ const inputFiles = normalizeInputFiles(request.inputFiles);
1059
+ return {
1060
+ PROMPT: shellEscape(request.prompt ?? ""),
1061
+ GUIDELINES: shellEscape(request.guidelines ?? ""),
1062
+ EVAL_ID: shellEscape(request.evalCaseId ?? ""),
1063
+ ATTEMPT: shellEscape(String(request.attempt ?? 0)),
1064
+ FILES: formatFileList(inputFiles, config.filesFormat)
1065
+ };
1066
+ }
1067
+ function normalizeInputFiles(inputFiles) {
1068
+ if (!inputFiles || inputFiles.length === 0) {
1069
+ return void 0;
1070
+ }
1071
+ const unique = /* @__PURE__ */ new Map();
1072
+ for (const inputFile of inputFiles) {
1073
+ const absolutePath = import_node_path3.default.resolve(inputFile);
1074
+ if (!unique.has(absolutePath)) {
1075
+ unique.set(absolutePath, absolutePath);
1076
+ }
1077
+ }
1078
+ return Array.from(unique.values());
1079
+ }
1080
+ function formatFileList(files, template) {
1081
+ if (!files || files.length === 0) {
1082
+ return "";
1083
+ }
1084
+ const formatter = template ?? "{path}";
1085
+ return files.map((filePath) => {
1086
+ const escapedPath = shellEscape(filePath);
1087
+ const escapedName = shellEscape(import_node_path3.default.basename(filePath));
1088
+ return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
1089
+ }).join(" ");
1090
+ }
1091
+ function renderTemplate(template, values) {
1092
+ return template.replace(/\{([A-Z_]+)\}/g, (match, key) => {
1093
+ const replacement = values[key];
1094
+ return replacement !== void 0 ? replacement : match;
1095
+ });
1096
+ }
1097
+ function shellEscape(value) {
1098
+ if (value.length === 0) {
1099
+ return "''";
1100
+ }
1101
+ if (process.platform === "win32") {
1102
+ const escaped = value.replace(/"/g, '\\"');
1103
+ return `"${escaped}"`;
1104
+ }
1105
+ return `'${value.replace(/'/g, `'"'"'`)}'`;
1106
+ }
1107
+ function formatTimeoutSuffix(timeoutMs) {
1108
+ if (!timeoutMs || timeoutMs <= 0) {
1109
+ return "";
1110
+ }
1111
+ const seconds = Math.ceil(timeoutMs / 1e3);
1112
+ return ` after ${seconds}s`;
1113
+ }
1114
+
1115
+ // src/evaluation/providers/codex.ts
1116
+ var import_node_child_process2 = require("child_process");
1117
+ var import_node_fs3 = require("fs");
1118
+ var import_promises3 = require("fs/promises");
1119
+ var import_node_os = require("os");
1120
+ var import_node_path5 = __toESM(require("path"), 1);
1121
+ var import_node_util2 = require("util");
1122
+
1123
+ // src/evaluation/providers/preread.ts
1124
+ var import_node_path4 = __toESM(require("path"), 1);
1125
+ function buildPromptDocument(request, inputFiles, options) {
1126
+ const parts = [];
1127
+ const guidelineFiles = collectGuidelineFiles(
1128
+ inputFiles,
1129
+ options?.guidelinePatterns ?? request.guideline_patterns,
1130
+ options?.guidelineOverrides
1131
+ );
1132
+ const inputFilesList = collectInputFiles(inputFiles);
1133
+ const nonGuidelineInputFiles = inputFilesList.filter(
1134
+ (file) => !guidelineFiles.includes(file)
1135
+ );
1136
+ const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
1137
+ if (prereadBlock.length > 0) {
1138
+ parts.push("\n", prereadBlock);
1139
+ }
1140
+ parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
1141
+ return parts.join("\n").trim();
1142
+ }
1143
+ function normalizeInputFiles2(inputFiles) {
1144
+ if (!inputFiles || inputFiles.length === 0) {
1145
+ return void 0;
1146
+ }
1147
+ const deduped = /* @__PURE__ */ new Map();
1148
+ for (const inputFile of inputFiles) {
1149
+ const absolutePath = import_node_path4.default.resolve(inputFile);
1150
+ if (!deduped.has(absolutePath)) {
1151
+ deduped.set(absolutePath, absolutePath);
1152
+ }
1153
+ }
1154
+ return Array.from(deduped.values());
1155
+ }
1156
+ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
1157
+ if (!inputFiles || inputFiles.length === 0) {
1158
+ return [];
1159
+ }
1160
+ const unique = /* @__PURE__ */ new Map();
1161
+ for (const inputFile of inputFiles) {
1162
+ const absolutePath = import_node_path4.default.resolve(inputFile);
1163
+ if (overrides?.has(absolutePath)) {
1164
+ if (!unique.has(absolutePath)) {
1165
+ unique.set(absolutePath, absolutePath);
1166
+ }
1167
+ continue;
1168
+ }
1169
+ const normalized = absolutePath.split(import_node_path4.default.sep).join("/");
1170
+ if (isGuidelineFile(normalized, guidelinePatterns)) {
1171
+ if (!unique.has(absolutePath)) {
1172
+ unique.set(absolutePath, absolutePath);
1173
+ }
1174
+ }
1175
+ }
1176
+ return Array.from(unique.values());
1177
+ }
1178
+ function collectInputFiles(inputFiles) {
1179
+ if (!inputFiles || inputFiles.length === 0) {
1180
+ return [];
1181
+ }
1182
+ const unique = /* @__PURE__ */ new Map();
1183
+ for (const inputFile of inputFiles) {
1184
+ const absolutePath = import_node_path4.default.resolve(inputFile);
1185
+ if (!unique.has(absolutePath)) {
1186
+ unique.set(absolutePath, absolutePath);
1187
+ }
1188
+ }
1189
+ return Array.from(unique.values());
1190
+ }
1191
+ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
1192
+ if (guidelineFiles.length === 0 && inputFiles.length === 0) {
1193
+ return "";
1194
+ }
1195
+ const buildList = (files) => files.map((absolutePath) => {
1196
+ const fileName = import_node_path4.default.basename(absolutePath);
1197
+ const fileUri = pathToFileUri(absolutePath);
1198
+ return `* [${fileName}](${fileUri})`;
1199
+ });
1200
+ const sections = [];
1201
+ if (guidelineFiles.length > 0) {
1202
+ sections.push(`Read all guideline files:
1203
+ ${buildList(guidelineFiles).join("\n")}.`);
1204
+ }
1205
+ if (inputFiles.length > 0) {
1206
+ sections.push(`Read all input files:
1207
+ ${buildList(inputFiles).join("\n")}.`);
1208
+ }
1209
+ sections.push(
1210
+ "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
1211
+ "Then apply system_instructions on the user query below."
1212
+ );
1213
+ return sections.join("\n");
1214
+ }
1215
+ function pathToFileUri(filePath) {
1216
+ const absolutePath = import_node_path4.default.isAbsolute(filePath) ? filePath : import_node_path4.default.resolve(filePath);
1217
+ const normalizedPath = absolutePath.replace(/\\/g, "/");
1218
+ if (/^[a-zA-Z]:\//.test(normalizedPath)) {
1219
+ return `file:///${normalizedPath}`;
1220
+ }
1221
+ return `file://${normalizedPath}`;
1222
+ }
1223
+
1224
+ // src/evaluation/providers/codex.ts
1225
+ var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
1226
+ var WORKSPACE_PREFIX = "agentv-codex-";
1227
+ var PROMPT_FILENAME = "prompt.md";
1228
+ var FILES_DIR = "files";
1229
+ var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
1230
+ var CodexProvider = class {
1231
+ id;
1232
+ kind = "codex";
1233
+ targetName;
1234
+ supportsBatch = false;
1235
+ config;
1236
+ runCodex;
1237
+ environmentCheck;
1238
+ resolvedExecutable;
1239
+ constructor(targetName, config, runner = defaultCodexRunner) {
1240
+ this.id = `codex:${targetName}`;
1241
+ this.targetName = targetName;
1242
+ this.config = config;
1243
+ this.runCodex = runner;
1244
+ }
1245
+ async invoke(request) {
1246
+ if (request.signal?.aborted) {
1247
+ throw new Error("Codex provider request was aborted before execution");
1248
+ }
1249
+ await this.ensureEnvironmentReady();
1250
+ const inputFiles = normalizeInputFiles2(request.inputFiles);
1251
+ const originalGuidelines = new Set(
1252
+ collectGuidelineFiles(inputFiles, request.guideline_patterns).map((file) => import_node_path5.default.resolve(file))
1253
+ );
1254
+ const workspaceRoot = await this.createWorkspace();
1255
+ try {
1256
+ const { mirroredInputFiles, guidelineMirrors } = await this.mirrorInputFiles(
1257
+ inputFiles,
1258
+ workspaceRoot,
1259
+ originalGuidelines
1260
+ );
1261
+ const promptContent = buildPromptDocument(request, mirroredInputFiles, {
1262
+ guidelinePatterns: request.guideline_patterns,
1263
+ guidelineOverrides: guidelineMirrors
1264
+ });
1265
+ const promptFile = import_node_path5.default.join(workspaceRoot, PROMPT_FILENAME);
1266
+ await (0, import_promises3.writeFile)(promptFile, promptContent, "utf8");
1267
+ const args = this.buildCodexArgs();
1268
+ const cwd = this.resolveCwd(workspaceRoot);
1269
+ const result = await this.executeCodex(args, cwd, promptContent, request.signal);
1270
+ if (result.timedOut) {
1271
+ throw new Error(
1272
+ `Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
1273
+ );
1274
+ }
1275
+ if (result.exitCode !== 0) {
1276
+ const detail = pickDetail(result.stderr, result.stdout);
1277
+ const prefix = `Codex CLI exited with code ${result.exitCode}`;
1278
+ throw new Error(detail ? `${prefix}: ${detail}` : prefix);
1279
+ }
1280
+ const parsed = parseCodexJson(result.stdout);
1281
+ const assistantText = extractAssistantText(parsed);
1282
+ return {
1283
+ text: assistantText,
1284
+ raw: {
1285
+ response: parsed,
1286
+ stdout: result.stdout,
1287
+ stderr: result.stderr,
1288
+ exitCode: result.exitCode,
1289
+ args,
1290
+ executable: this.resolvedExecutable ?? this.config.executable,
1291
+ promptFile,
1292
+ workspace: workspaceRoot,
1293
+ inputFiles: mirroredInputFiles
1294
+ }
1295
+ };
1296
+ } finally {
1297
+ await this.cleanupWorkspace(workspaceRoot);
1298
+ }
1299
+ }
1300
+ async ensureEnvironmentReady() {
1301
+ if (!this.environmentCheck) {
1302
+ this.environmentCheck = this.validateEnvironment();
1303
+ }
1304
+ await this.environmentCheck;
1305
+ }
1306
+ async validateEnvironment() {
1307
+ this.resolvedExecutable = await locateExecutable(this.config.executable);
1308
+ }
1309
+ resolveCwd(workspaceRoot) {
1310
+ if (!this.config.cwd) {
1311
+ return workspaceRoot;
1312
+ }
1313
+ return import_node_path5.default.resolve(this.config.cwd);
1314
+ }
1315
+ buildCodexArgs() {
1316
+ const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
1317
+ if (this.config.args && this.config.args.length > 0) {
1318
+ args.push(...this.config.args);
1319
+ }
1320
+ args.push("-");
1321
+ return args;
1322
+ }
1323
+ async executeCodex(args, cwd, promptContent, signal) {
1324
+ try {
1325
+ return await this.runCodex({
1326
+ executable: this.resolvedExecutable ?? this.config.executable,
1327
+ args,
1328
+ cwd,
1329
+ prompt: promptContent,
1330
+ timeoutMs: this.config.timeoutMs,
1331
+ env: process.env,
1332
+ signal
1333
+ });
1334
+ } catch (error) {
1335
+ const err = error;
1336
+ if (err.code === "ENOENT") {
1337
+ throw new Error(
1338
+ `Codex executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
1339
+ );
1340
+ }
1341
+ throw error;
1342
+ }
1343
+ }
1344
+ async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
1345
+ if (!inputFiles || inputFiles.length === 0) {
1346
+ return {
1347
+ mirroredInputFiles: void 0,
1348
+ guidelineMirrors: /* @__PURE__ */ new Set()
1349
+ };
1350
+ }
1351
+ const filesRoot = import_node_path5.default.join(workspaceRoot, FILES_DIR);
1352
+ await (0, import_promises3.mkdir)(filesRoot, { recursive: true });
1353
+ const mirrored = [];
1354
+ const guidelineMirrors = /* @__PURE__ */ new Set();
1355
+ const nameCounts = /* @__PURE__ */ new Map();
1356
+ for (const inputFile of inputFiles) {
1357
+ const absoluteSource = import_node_path5.default.resolve(inputFile);
1358
+ const baseName = import_node_path5.default.basename(absoluteSource);
1359
+ const count = nameCounts.get(baseName) ?? 0;
1360
+ nameCounts.set(baseName, count + 1);
1361
+ const finalName = count === 0 ? baseName : `${baseName}.${count}`;
1362
+ const destination = import_node_path5.default.join(filesRoot, finalName);
1363
+ await (0, import_promises3.copyFile)(absoluteSource, destination);
1364
+ const resolvedDestination = import_node_path5.default.resolve(destination);
1365
+ mirrored.push(resolvedDestination);
1366
+ if (guidelineOriginals.has(absoluteSource)) {
1367
+ guidelineMirrors.add(resolvedDestination);
1368
+ }
1369
+ }
1370
+ return {
1371
+ mirroredInputFiles: mirrored,
1372
+ guidelineMirrors
1373
+ };
1374
+ }
1375
+ async createWorkspace() {
1376
+ return await (0, import_promises3.mkdtemp)(import_node_path5.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
1377
+ }
1378
+ async cleanupWorkspace(workspaceRoot) {
1379
+ try {
1380
+ await (0, import_promises3.rm)(workspaceRoot, { recursive: true, force: true });
1381
+ } catch {
1382
+ }
1383
+ }
1384
+ };
1385
+ async function locateExecutable(candidate) {
1386
+ const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
1387
+ if (includesPathSeparator) {
1388
+ const resolved = import_node_path5.default.isAbsolute(candidate) ? candidate : import_node_path5.default.resolve(candidate);
1389
+ const executablePath = await ensureWindowsExecutableVariant(resolved);
1390
+ await (0, import_promises3.access)(executablePath, import_node_fs3.constants.F_OK);
1391
+ return executablePath;
1392
+ }
1393
+ const locator = process.platform === "win32" ? "where" : "which";
1394
+ try {
1395
+ const { stdout } = await execAsync2(`${locator} ${candidate}`);
1396
+ const lines = stdout.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
1397
+ const preferred = selectExecutableCandidate(lines);
1398
+ if (preferred) {
1399
+ const executablePath = await ensureWindowsExecutableVariant(preferred);
1400
+ await (0, import_promises3.access)(executablePath, import_node_fs3.constants.F_OK);
1401
+ return executablePath;
1402
+ }
1403
+ } catch {
1404
+ }
1405
+ throw new Error(`Codex executable '${candidate}' was not found on PATH`);
1406
+ }
1407
+ function selectExecutableCandidate(candidates) {
1408
+ if (candidates.length === 0) {
1409
+ return void 0;
1410
+ }
1411
+ if (process.platform !== "win32") {
1412
+ return candidates[0];
1413
+ }
1414
+ const extensions = getWindowsExecutableExtensions();
1415
+ for (const ext of extensions) {
1416
+ const match = candidates.find((candidate) => candidate.toLowerCase().endsWith(ext));
1417
+ if (match) {
1418
+ return match;
1419
+ }
1420
+ }
1421
+ return candidates[0];
1422
+ }
1423
+ async function ensureWindowsExecutableVariant(candidate) {
1424
+ if (process.platform !== "win32") {
1425
+ return candidate;
1426
+ }
1427
+ if (hasExecutableExtension(candidate)) {
1428
+ return candidate;
1429
+ }
1430
+ const extensions = getWindowsExecutableExtensions();
1431
+ for (const ext of extensions) {
1432
+ const withExtension = `${candidate}${ext}`;
1433
+ try {
1434
+ await (0, import_promises3.access)(withExtension, import_node_fs3.constants.F_OK);
1435
+ return withExtension;
1436
+ } catch {
1437
+ }
1438
+ }
1439
+ return candidate;
1440
+ }
1441
+ function hasExecutableExtension(candidate) {
1442
+ const lower = candidate.toLowerCase();
1443
+ return getWindowsExecutableExtensions().some((ext) => lower.endsWith(ext));
1444
+ }
1445
+ var DEFAULT_WINDOWS_EXTENSIONS = [".com", ".exe", ".bat", ".cmd", ".ps1"];
1446
+ function getWindowsExecutableExtensions() {
1447
+ if (process.platform !== "win32") {
1448
+ return [];
1449
+ }
1450
+ const fromEnv = process.env.PATHEXT?.split(";").map((ext) => ext.trim().toLowerCase()).filter((ext) => ext.length > 0);
1451
+ return fromEnv && fromEnv.length > 0 ? fromEnv : DEFAULT_WINDOWS_EXTENSIONS;
1452
+ }
1453
+ function parseCodexJson(output) {
1454
+ const trimmed = output.trim();
1455
+ if (trimmed.length === 0) {
1456
+ throw new Error("Codex CLI produced no output in --json mode");
1457
+ }
1458
+ try {
1459
+ return JSON.parse(trimmed);
1460
+ } catch {
1461
+ const lineObjects = parseJsonLines(trimmed);
1462
+ if (lineObjects) {
1463
+ return lineObjects;
1464
+ }
1465
+ const lastBrace = trimmed.lastIndexOf("{");
1466
+ if (lastBrace >= 0) {
1467
+ const candidate = trimmed.slice(lastBrace);
1468
+ try {
1469
+ return JSON.parse(candidate);
1470
+ } catch {
1471
+ }
1472
+ }
1473
+ const preview = trimmed.slice(0, 200);
1474
+ throw new Error(`Codex CLI emitted invalid JSON: ${preview}${trimmed.length > 200 ? "\u2026" : ""}`);
1475
+ }
1476
+ }
1477
+ function extractAssistantText(parsed) {
1478
+ if (Array.isArray(parsed)) {
1479
+ const text = extractFromEventStream(parsed);
1480
+ if (text) {
1481
+ return text;
1482
+ }
1483
+ }
1484
+ if (!parsed || typeof parsed !== "object") {
1485
+ throw new Error("Codex CLI JSON response did not include an assistant message");
1486
+ }
1487
+ const record = parsed;
1488
+ const eventText = extractFromEvent(record);
1489
+ if (eventText) {
1490
+ return eventText;
1491
+ }
1492
+ const messages = Array.isArray(record.messages) ? record.messages : void 0;
1493
+ if (messages) {
1494
+ for (let index = messages.length - 1; index >= 0; index -= 1) {
1495
+ const entry = messages[index];
1496
+ if (!entry || typeof entry !== "object") {
1497
+ continue;
1498
+ }
1499
+ const role = entry.role;
1500
+ if (role !== "assistant") {
1501
+ continue;
1502
+ }
1503
+ const content = entry.content;
1504
+ const flattened = flattenContent(content);
1505
+ if (flattened) {
1506
+ return flattened;
1507
+ }
1508
+ }
1509
+ }
1510
+ const response = record.response;
1511
+ if (response && typeof response === "object") {
1512
+ const content = response.content;
1513
+ const flattened = flattenContent(content);
1514
+ if (flattened) {
1515
+ return flattened;
1516
+ }
1517
+ }
1518
+ const output = record.output;
1519
+ const flattenedOutput = flattenContent(output);
1520
+ if (flattenedOutput) {
1521
+ return flattenedOutput;
1522
+ }
1523
+ throw new Error("Codex CLI JSON response did not include an assistant message");
1524
+ }
1525
+ function extractFromEventStream(events) {
1526
+ for (let index = events.length - 1; index >= 0; index -= 1) {
1527
+ const candidate = events[index];
1528
+ const text = extractFromEvent(candidate);
1529
+ if (text) {
1530
+ return text;
1531
+ }
1532
+ }
1533
+ return void 0;
1534
+ }
1535
+ function extractFromEvent(event) {
1536
+ if (!event || typeof event !== "object") {
1537
+ return void 0;
1538
+ }
1539
+ const record = event;
1540
+ const type = typeof record.type === "string" ? record.type : void 0;
1541
+ if (type === JSONL_TYPE_ITEM_COMPLETED) {
1542
+ const item = record.item;
1543
+ const text = extractFromItem(item);
1544
+ if (text) {
1545
+ return text;
1546
+ }
1547
+ }
1548
+ const output = record.output ?? record.content;
1549
+ const flattened = flattenContent(output);
1550
+ if (flattened) {
1551
+ return flattened;
1552
+ }
1553
+ return void 0;
1554
+ }
1555
/**
 * Extracts text from an item when its `type` is "agent_message", "response"
 * or "output"; the first defined of `text`, `content`, `output` is flattened.
 *
 * @param {unknown} item - Item payload taken from an event.
 * @returns {string | undefined} Non-empty text, or `undefined` for other item types or empty content.
 */
function extractFromItem(item) {
  if (item === null || typeof item !== "object") {
    return undefined;
  }
  const kind = typeof item.type === "string" ? item.type : undefined;
  switch (kind) {
    case "agent_message":
    case "response":
    case "output":
      return flattenContent(item.text ?? item.content ?? item.output) || undefined;
    default:
      return undefined;
  }
}
1569
/**
 * Reduces a content value to a single string.
 *
 * - A string is returned unchanged (including the empty string).
 * - An array contributes its string entries and the string `text` fields of its
 *   object entries; empty pieces are dropped and the rest joined with " \n".
 * - Any other object contributes its `text` field when that is a string.
 *
 * @param {unknown} value - Content in any of the shapes above.
 * @returns {string | undefined} The flattened text, or `undefined` when nothing textual was found.
 */
function flattenContent(value) {
  // Reads `node.text` when node is an object carrying a string `text` member.
  const readTextField = (node) => {
    if (!node || typeof node !== "object" || !("text" in node)) {
      return undefined;
    }
    const candidate = node.text;
    return typeof candidate === "string" ? candidate : undefined;
  };

  if (typeof value === "string") {
    return value;
  }
  if (!Array.isArray(value)) {
    return readTextField(value);
  }

  const pieces = [];
  for (const segment of value) {
    const piece = typeof segment === "string" ? segment : readTextField(segment);
    if (typeof piece === "string" && piece.length > 0) {
      pieces.push(piece);
    }
  }
  return pieces.length === 0 ? undefined : pieces.join(" \n");
}
1592
/**
 * Parses text as JSON Lines: one JSON document per non-blank line.
 *
 * @param {string} output - Raw text to parse.
 * @returns {Array<unknown> | undefined} The parsed values in line order, or
 *   `undefined` when there are fewer than two non-blank lines or any line is
 *   not valid JSON.
 */
function parseJsonLines(output) {
  const rows = [];
  for (const raw of output.split(/\r?\n/)) {
    const line = raw.trim();
    if (line.length > 0) {
      rows.push(line);
    }
  }
  // A single line is not treated as a JSONL stream.
  if (rows.length < 2) {
    return undefined;
  }
  try {
    return rows.map((row) => JSON.parse(row));
  } catch {
    return undefined;
  }
}
1607
/**
 * Chooses the text to show as failure detail: trimmed stderr when it is
 * non-empty, otherwise trimmed stdout.
 *
 * @param {string} stderr - Captured standard error.
 * @param {string} stdout - Captured standard output.
 * @returns {string | undefined} The first non-blank stream, trimmed, or `undefined` if both are blank.
 */
function pickDetail(stderr, stdout) {
  for (const stream of [stderr, stdout]) {
    const text = stream.trim();
    if (text.length > 0) {
      return text;
    }
  }
  return undefined;
}
1615
/**
 * Builds the " after Ns" suffix for timeout messages.
 *
 * @param {number | undefined} timeoutMs - Timeout in milliseconds.
 * @returns {string} " after <seconds>s" with seconds rounded up, or "" when the
 *   timeout is missing, zero or negative.
 */
function formatTimeoutSuffix2(timeoutMs) {
  if (timeoutMs && !(timeoutMs <= 0)) {
    return ` after ${Math.ceil(timeoutMs / 1e3)}s`;
  }
  return "";
}
1622
/**
 * Runs the configured executable as a child process, writes the prompt to its
 * stdin and collects everything it prints.
 *
 * @param {object} options
 * @param {string} options.executable - Program to spawn.
 * @param {string[]} options.args - Arguments passed to the program.
 * @param {string} [options.cwd] - Working directory for the child.
 * @param {object} [options.env] - Environment for the child.
 * @param {string} options.prompt - Text written to the child's stdin, which is then closed.
 * @param {number} [options.timeoutMs] - When positive, the child is sent SIGTERM after this many milliseconds.
 * @param {AbortSignal} [options.signal] - Aborting sends SIGTERM to the child.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number, timedOut: boolean}>}
 *   Resolves when the child closes; `exitCode` is -1 when no numeric code is
 *   reported. Rejects when the child emits an `error` event.
 */
async function defaultCodexRunner(options) {
  return await new Promise((resolve, reject) => {
    const child = (0, import_node_child_process2.spawn)(options.executable, options.args, {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: shouldShellExecute(options.executable)
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    const onAbort = () => {
      child.kill("SIGTERM");
    };
    if (options.signal) {
      if (options.signal.aborted) {
        onAbort();
      } else {
        options.signal.addEventListener("abort", onAbort, { once: true });
      }
    }
    let timeoutHandle;
    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill("SIGTERM");
      }, options.timeoutMs);
      // Do not let the pending timer keep the process alive on its own.
      timeoutHandle.unref?.();
    }
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    // A child that exits (or never starts) before consuming the prompt makes the
    // stdin pipe emit an error such as EPIPE. With no listener that error would
    // be thrown as an uncaught exception; the outcome is already reported via
    // the child's own "error"/"close" events below, so it is ignored here.
    child.stdin.on("error", () => {
    });
    child.stdin.end(options.prompt);
    const cleanup = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", onAbort);
      }
    };
    child.on("error", (error) => {
      cleanup();
      reject(error);
    });
    child.on("close", (code) => {
      cleanup();
      resolve({
        stdout,
        stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
1683
/**
 * Decides whether the executable must be launched through a shell.
 *
 * @param {string} executable - Executable name or path.
 * @returns {boolean} `true` only on win32 when the name ends (case-insensitively)
 *   in ".cmd", ".bat" or ".ps1"; `false` otherwise.
 */
function shouldShellExecute(executable) {
  if (process.platform === "win32") {
    const name = executable.toLowerCase();
    return [".cmd", ".bat", ".ps1"].some((extension) => name.endsWith(extension));
  }
  return false;
}
1690
+
838
1691
  // src/evaluation/providers/mock.ts
839
1692
  var DEFAULT_MOCK_RESPONSE = '{"answer":"Mock provider response. Configure targets.yaml to supply a custom value."}';
840
1693
  var MockProvider = class {
@@ -878,6 +1731,7 @@ var MockProvider = class {
878
1731
 
879
1732
  // src/evaluation/providers/targets.ts
880
1733
  var import_zod = require("zod");
1734
+ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES"]);
881
1735
  var BASE_TARGET_SCHEMA = import_zod.z.object({
882
1736
  name: import_zod.z.string().min(1, "target name is required"),
883
1737
  provider: import_zod.z.string().min(1, "provider is required"),
@@ -934,6 +1788,16 @@ function resolveTargetDefinition(definition, env = process.env) {
934
1788
  providerBatching,
935
1789
  config: resolveGeminiConfig(parsed, env)
936
1790
  };
1791
+ case "codex":
1792
+ case "codex-cli":
1793
+ return {
1794
+ kind: "codex",
1795
+ name: parsed.name,
1796
+ judgeTarget: parsed.judge_target,
1797
+ workers: parsed.workers,
1798
+ providerBatching,
1799
+ config: resolveCodexConfig(parsed, env)
1800
+ };
937
1801
  case "mock":
938
1802
  return {
939
1803
  kind: "mock",
@@ -951,7 +1815,16 @@ function resolveTargetDefinition(definition, env = process.env) {
951
1815
  judgeTarget: parsed.judge_target,
952
1816
  workers: parsed.workers,
953
1817
  providerBatching,
954
- config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
1818
+ config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
1819
+ };
1820
+ case "cli":
1821
+ return {
1822
+ kind: "cli",
1823
+ name: parsed.name,
1824
+ judgeTarget: parsed.judge_target,
1825
+ workers: parsed.workers,
1826
+ providerBatching,
1827
+ config: resolveCliConfig(parsed, env)
955
1828
  };
956
1829
  default:
957
1830
  throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
@@ -1020,6 +1893,29 @@ function resolveGeminiConfig(target, env) {
1020
1893
  maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`)
1021
1894
  };
1022
1895
  }
1896
/**
 * Builds the Codex provider configuration from a target's `settings`.
 *
 * Accepted setting aliases: `executable` / `command` / `binary`,
 * `args` / `arguments`, `cwd`, and `timeout_seconds` / `timeoutSeconds`.
 * The executable falls back to "codex" when none resolves.
 *
 * @param {object} target - Parsed target definition (`name`, optional `settings`).
 * @param {object} env - Environment map handed to the resolver helpers.
 * @returns {{executable: string, args: (string[] | undefined), cwd: (string | undefined), timeoutMs: (number | undefined)}}
 */
function resolveCodexConfig(target, env) {
  const settings = target.settings ?? {};
  const label = (suffix) => `${target.name} codex ${suffix}`;
  return {
    executable: resolveOptionalString(
      settings.executable ?? settings.command ?? settings.binary,
      env,
      label("executable"),
      { allowLiteral: true, optionalEnv: true }
    ) ?? "codex",
    args: resolveOptionalStringArray(settings.args ?? settings.arguments, env, label("args")),
    cwd: resolveOptionalString(settings.cwd, env, label("cwd"), {
      allowLiteral: true,
      optionalEnv: true
    }),
    timeoutMs: resolveTimeoutMs(settings.timeout_seconds ?? settings.timeoutSeconds, label("timeout"))
  };
}
1023
1919
  function resolveMockConfig(target) {
1024
1920
  const settings = target.settings ?? {};
1025
1921
  const response = typeof settings.response === "string" ? settings.response : void 0;
@@ -1049,6 +1945,125 @@ function resolveVSCodeConfig(target, env, insiders) {
1049
1945
  workspaceTemplate
1050
1946
  };
1051
1947
  }
1948
/**
 * Builds the generic CLI provider configuration from a target's `settings`.
 *
 * Optional settings (files format, cwd, env overrides, timeout, healthcheck)
 * are resolved first; the command template is resolved and its placeholders
 * validated last, matching the order in which validation errors surface.
 *
 * @param {object} target - Parsed target definition (`name`, optional `settings`).
 * @param {object} env - Environment map handed to the resolver helpers.
 * @returns {{commandTemplate: string, filesFormat: *, cwd: *, env: *, timeoutMs: *, healthcheck: *}}
 */
function resolveCliConfig(target, env) {
  const { name } = target;
  const settings = target.settings ?? {};
  const templateLabel = `${name} CLI command template`;
  // `commandTemplate` is declared first so the result keeps its key order,
  // but it is filled in only after the optional settings have been resolved.
  const config = {
    commandTemplate: undefined,
    filesFormat: resolveOptionalLiteralString(
      settings.files_format ?? settings.filesFormat ?? settings.attachments_format ?? settings.attachmentsFormat
    ),
    cwd: resolveOptionalString(settings.cwd, env, `${name} working directory`, {
      allowLiteral: true,
      optionalEnv: true
    }),
    env: resolveEnvOverrides(settings.env, env, name),
    timeoutMs: resolveTimeoutMs(settings.timeout_seconds ?? settings.timeoutSeconds, `${name} timeout`),
    healthcheck: resolveCliHealthcheck(settings.healthcheck, env, name)
  };
  config.commandTemplate = resolveString(
    settings.command_template ?? settings.commandTemplate,
    env,
    templateLabel,
    true
  );
  assertSupportedCliPlaceholders(config.commandTemplate, templateLabel);
  return config;
}
1977
/**
 * Resolves a map of environment-variable overrides.
 *
 * @param {unknown} source - Raw `env` setting; `undefined`/`null` means "none".
 * @param {object} env - Environment map handed to `resolveString`.
 * @param {string} targetName - Target name used in error messages.
 * @returns {Object<string, string> | undefined} Resolved overrides, or `undefined` when there are none.
 * @throws {Error} When `source` is not a non-array object or any value is not a string.
 */
function resolveEnvOverrides(source, env, targetName) {
  if (source == null) {
    return undefined;
  }
  const isObjectMap = typeof source === "object" && !Array.isArray(source);
  if (!isObjectMap) {
    throw new Error(`${targetName} env overrides must be an object map of strings`);
  }
  const resolved = {};
  Object.entries(source).forEach(([key, value]) => {
    if (typeof value !== "string") {
      throw new Error(`${targetName} env override '${key}' must be a string`);
    }
    resolved[key] = resolveString(value, env, `${targetName} env override '${key}'`);
  });
  return Object.keys(resolved).length === 0 ? undefined : resolved;
}
1995
/**
 * Converts a timeout setting expressed in seconds into whole milliseconds.
 *
 * @param {unknown} source - Raw timeout value, passed to `resolveOptionalNumber`.
 * @param {string} description - Label used in error messages.
 * @returns {number | undefined} Timeout in milliseconds (at least 1), or
 *   `undefined` when no timeout was supplied.
 * @throws {Error} When the resolved number of seconds is zero or negative.
 */
function resolveTimeoutMs(source, description) {
  const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
  if (seconds === undefined) {
    return undefined;
  }
  if (seconds <= 0) {
    throw new Error(`${description} must be greater than zero seconds`);
  }
  // A positive value below one millisecond would floor to 0, which timeout
  // consumers treat as "no timeout"; clamp so an accepted timeout stays in effect.
  return Math.max(1, Math.floor(seconds * 1e3));
}
2005
/**
 * Resolves the optional `healthcheck` setting of a CLI target.
 *
 * Two shapes are accepted, selected by `type`:
 * - "http": requires `url`.
 * - "command": requires `command_template` / `commandTemplate` (placeholders
 *   are validated) and accepts an optional `cwd`.
 * Both accept `timeout_seconds` / `timeoutSeconds`, which is resolved before
 * the type is inspected.
 *
 * @param {unknown} source - Raw healthcheck setting; `undefined`/`null` means "none".
 * @param {object} env - Environment map handed to the resolver helpers.
 * @param {string} targetName - Target name used in error messages.
 * @returns {object | undefined} The resolved healthcheck, or `undefined` when none is configured.
 * @throws {Error} When `source` is not a non-array object or `type` is neither "http" nor "command".
 */
function resolveCliHealthcheck(source, env, targetName) {
  if (source == null) {
    return undefined;
  }
  if (Array.isArray(source) || typeof source !== "object") {
    throw new Error(`${targetName} healthcheck must be an object`);
  }
  const timeoutMs = resolveTimeoutMs(
    source.timeout_seconds ?? source.timeoutSeconds,
    `${targetName} healthcheck timeout`
  );
  switch (source.type) {
    case "http":
      return {
        type: "http",
        url: resolveString(source.url, env, `${targetName} healthcheck URL`),
        timeoutMs
      };
    case "command": {
      const templateLabel = `${targetName} healthcheck command template`;
      const commandTemplate = resolveString(
        source.command_template ?? source.commandTemplate,
        env,
        templateLabel,
        true
      );
      assertSupportedCliPlaceholders(commandTemplate, templateLabel);
      const cwd = resolveOptionalString(source.cwd, env, `${targetName} healthcheck cwd`, {
        allowLiteral: true,
        optionalEnv: true
      });
      return {
        type: "command",
        commandTemplate,
        timeoutMs,
        cwd
      };
    }
    default:
      throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
  }
}
2047
/**
 * Verifies that every `{PLACEHOLDER}` in a command template is one of the
 * names in `CLI_PLACEHOLDERS`.
 *
 * @param {string} template - Command template to inspect.
 * @param {string} description - Label used in the error message.
 * @throws {Error} On the first placeholder that is not supported.
 */
function assertSupportedCliPlaceholders(template, description) {
  const unsupported = extractCliPlaceholders(template).find(
    (name) => !CLI_PLACEHOLDERS.has(name)
  );
  if (unsupported !== undefined) {
    throw new Error(
      `${description} includes unsupported placeholder '{${unsupported}}'. Supported placeholders: ${Array.from(CLI_PLACEHOLDERS).join(", ")}`
    );
  }
}
2057
/**
 * Lists the placeholder names used in a command template.
 *
 * A placeholder is a run of upper-case letters and underscores wrapped in
 * braces, e.g. `{PROMPT}`. Names are returned in order of appearance and
 * duplicates are kept.
 *
 * @param {string} template - Command template to scan.
 * @returns {string[]} Placeholder names without the braces.
 */
function extractCliPlaceholders(template) {
  return Array.from(
    template.matchAll(/\{([A-Z_]+)\}/g),
    ([, name]) => name
  ).filter(Boolean);
}
1052
2067
  function resolveString(source, env, description, allowLiteral = false) {
1053
2068
  const value = resolveOptionalString(source, env, description, {
1054
2069
  allowLiteral,
@@ -1079,11 +2094,14 @@ function resolveOptionalString(source, env, description, options) {
1079
2094
  }
1080
2095
  const allowLiteral = options?.allowLiteral ?? false;
1081
2096
  const optionalEnv = options?.optionalEnv ?? false;
1082
- if (!allowLiteral && isLikelyEnvReference(trimmed)) {
2097
+ const looksLikeEnv = isLikelyEnvReference(trimmed);
2098
+ if (looksLikeEnv) {
1083
2099
  if (optionalEnv) {
1084
2100
  return void 0;
1085
2101
  }
1086
- throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
2102
+ if (!allowLiteral) {
2103
+ throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
2104
+ }
1087
2105
  }
1088
2106
  return trimmed;
1089
2107
  }
@@ -1133,10 +2151,42 @@ function resolveOptionalBoolean(source) {
1133
2151
  function isLikelyEnvReference(value) {
1134
2152
  return /^[A-Z0-9_]+$/.test(value);
1135
2153
  }
2154
/**
 * Resolves an optional list of strings, substituting any entry that names a
 * variable present in `env` with that variable's value.
 *
 * Each entry is trimmed first. An entry is replaced only when `env` has an own
 * string property with that exact name; inherited members such as
 * `constructor` or `toString` are never treated as variables, so they stay
 * literal instead of crashing the lookup.
 *
 * @param {unknown} source - Raw setting; `undefined`, `null` or `[]` means "none".
 * @param {object} env - Environment map used for substitution.
 * @param {string} description - Label used in error messages.
 * @returns {string[] | undefined} Resolved entries, or `undefined` when none were supplied.
 * @throws {Error} When `source` is not an array, an entry is not a string or is
 *   blank, or a referenced environment variable is blank.
 */
function resolveOptionalStringArray(source, env, description) {
  if (source === undefined || source === null) {
    return undefined;
  }
  if (!Array.isArray(source)) {
    throw new Error(`${description} must be an array of strings`);
  }
  if (source.length === 0) {
    return undefined;
  }
  // Array.from (unlike map) also visits holes, so sparse input is still rejected.
  return Array.from(source, (item, i) => {
    if (typeof item !== "string") {
      throw new Error(`${description}[${i}] must be a string`);
    }
    const trimmed = item.trim();
    if (trimmed.length === 0) {
      throw new Error(`${description}[${i}] cannot be empty`);
    }
    const isOwnVariable = env != null && Object.prototype.hasOwnProperty.call(env, trimmed);
    const envValue = isOwnVariable ? env[trimmed] : undefined;
    if (typeof envValue !== "string") {
      return trimmed;
    }
    if (envValue.trim().length === 0) {
      throw new Error(`Environment variable '${trimmed}' for ${description}[${i}] is empty`);
    }
    return envValue;
  });
}
1136
2186
 
1137
2187
  // src/evaluation/providers/vscode.ts
1138
- var import_promises3 = require("fs/promises");
1139
- var import_node_path3 = __toESM(require("path"), 1);
2188
+ var import_promises4 = require("fs/promises");
2189
+ var import_node_path6 = __toESM(require("path"), 1);
1140
2190
  var import_subagent = require("subagent");
1141
2191
  var VSCodeProvider = class {
1142
2192
  id;
@@ -1154,12 +2204,11 @@ var VSCodeProvider = class {
1154
2204
  if (request.signal?.aborted) {
1155
2205
  throw new Error("VS Code provider request was aborted before dispatch");
1156
2206
  }
1157
- const attachments = normalizeAttachments(request.attachments);
1158
- const promptContent = buildPromptDocument(request, attachments, request.guideline_patterns);
2207
+ const inputFiles = normalizeAttachments(request.inputFiles);
2208
+ const promptContent = buildPromptDocument2(request, inputFiles, request.guideline_patterns);
1159
2209
  const session = await (0, import_subagent.dispatchAgentSession)({
1160
2210
  userQuery: promptContent,
1161
- // Use full prompt content instead of just request.prompt
1162
- extraAttachments: attachments,
2211
+ extraAttachments: inputFiles,
1163
2212
  wait: this.config.waitForResponse,
1164
2213
  dryRun: this.config.dryRun,
1165
2214
  vscodeCmd: this.config.command,
@@ -1176,16 +2225,16 @@ var VSCodeProvider = class {
1176
2225
  text: "",
1177
2226
  raw: {
1178
2227
  session,
1179
- attachments
2228
+ inputFiles
1180
2229
  }
1181
2230
  };
1182
2231
  }
1183
- const responseText = await (0, import_promises3.readFile)(session.responseFile, "utf8");
2232
+ const responseText = await (0, import_promises4.readFile)(session.responseFile, "utf8");
1184
2233
  return {
1185
2234
  text: responseText,
1186
2235
  raw: {
1187
2236
  session,
1188
- attachments
2237
+ inputFiles
1189
2238
  }
1190
2239
  };
1191
2240
  }
@@ -1195,17 +2244,17 @@ var VSCodeProvider = class {
1195
2244
  }
1196
2245
  const normalizedRequests = requests.map((req) => ({
1197
2246
  request: req,
1198
- attachments: normalizeAttachments(req.attachments)
2247
+ inputFiles: normalizeAttachments(req.inputFiles)
1199
2248
  }));
1200
- const combinedAttachments = mergeAttachments(
1201
- normalizedRequests.map(({ attachments }) => attachments)
2249
+ const combinedInputFiles = mergeAttachments(
2250
+ normalizedRequests.map(({ inputFiles }) => inputFiles)
1202
2251
  );
1203
2252
  const userQueries = normalizedRequests.map(
1204
- ({ request, attachments }) => buildPromptDocument(request, attachments, request.guideline_patterns)
2253
+ ({ request, inputFiles }) => buildPromptDocument2(request, inputFiles, request.guideline_patterns)
1205
2254
  );
1206
2255
  const session = await (0, import_subagent.dispatchBatchAgent)({
1207
2256
  userQueries,
1208
- extraAttachments: combinedAttachments,
2257
+ extraAttachments: combinedInputFiles,
1209
2258
  wait: this.config.waitForResponse,
1210
2259
  dryRun: this.config.dryRun,
1211
2260
  vscodeCmd: this.config.command,
@@ -1218,12 +2267,12 @@ var VSCodeProvider = class {
1218
2267
  throw new Error(failure);
1219
2268
  }
1220
2269
  if (this.config.dryRun) {
1221
- return normalizedRequests.map(({ attachments }) => ({
2270
+ return normalizedRequests.map(({ inputFiles }) => ({
1222
2271
  text: "",
1223
2272
  raw: {
1224
2273
  session,
1225
- attachments,
1226
- allAttachments: combinedAttachments
2274
+ inputFiles,
2275
+ allInputFiles: combinedInputFiles
1227
2276
  }
1228
2277
  }));
1229
2278
  }
@@ -1234,13 +2283,13 @@ var VSCodeProvider = class {
1234
2283
  }
1235
2284
  const responses = [];
1236
2285
  for (const [index, responseFile] of session.responseFiles.entries()) {
1237
- const responseText = await (0, import_promises3.readFile)(responseFile, "utf8");
2286
+ const responseText = await (0, import_promises4.readFile)(responseFile, "utf8");
1238
2287
  responses.push({
1239
2288
  text: responseText,
1240
2289
  raw: {
1241
2290
  session,
1242
- attachments: normalizedRequests[index]?.attachments,
1243
- allAttachments: combinedAttachments,
2291
+ inputFiles: normalizedRequests[index]?.inputFiles,
2292
+ allInputFiles: combinedInputFiles,
1244
2293
  responseFile
1245
2294
  }
1246
2295
  });
@@ -1248,27 +2297,27 @@ var VSCodeProvider = class {
1248
2297
  return responses;
1249
2298
  }
1250
2299
  };
1251
- function buildPromptDocument(request, attachments, guidelinePatterns) {
2300
+ function buildPromptDocument2(request, attachments, guidelinePatterns) {
1252
2301
  const parts = [];
1253
- const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
2302
+ const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
1254
2303
  const attachmentFiles = collectAttachmentFiles(attachments);
1255
2304
  const nonGuidelineAttachments = attachmentFiles.filter(
1256
2305
  (file) => !guidelineFiles.includes(file)
1257
2306
  );
1258
- const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineAttachments);
2307
+ const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineAttachments);
1259
2308
  if (prereadBlock.length > 0) {
1260
2309
  parts.push("\n", prereadBlock);
1261
2310
  }
1262
2311
  parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
1263
2312
  return parts.join("\n").trim();
1264
2313
  }
1265
- function buildMandatoryPrereadBlock(guidelineFiles, attachmentFiles) {
2314
+ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
1266
2315
  if (guidelineFiles.length === 0 && attachmentFiles.length === 0) {
1267
2316
  return "";
1268
2317
  }
1269
2318
  const buildList = (files) => files.map((absolutePath) => {
1270
- const fileName = import_node_path3.default.basename(absolutePath);
1271
- const fileUri = pathToFileUri(absolutePath);
2319
+ const fileName = import_node_path6.default.basename(absolutePath);
2320
+ const fileUri = pathToFileUri2(absolutePath);
1272
2321
  return `* [${fileName}](${fileUri})`;
1273
2322
  });
1274
2323
  const sections = [];
@@ -1286,14 +2335,14 @@ ${buildList(attachmentFiles).join("\n")}.`);
1286
2335
  );
1287
2336
  return sections.join("\n");
1288
2337
  }
1289
- function collectGuidelineFiles(attachments, guidelinePatterns) {
2338
+ function collectGuidelineFiles2(attachments, guidelinePatterns) {
1290
2339
  if (!attachments || attachments.length === 0) {
1291
2340
  return [];
1292
2341
  }
1293
2342
  const unique = /* @__PURE__ */ new Map();
1294
2343
  for (const attachment of attachments) {
1295
- const absolutePath = import_node_path3.default.resolve(attachment);
1296
- const normalized = absolutePath.split(import_node_path3.default.sep).join("/");
2344
+ const absolutePath = import_node_path6.default.resolve(attachment);
2345
+ const normalized = absolutePath.split(import_node_path6.default.sep).join("/");
1297
2346
  if (isGuidelineFile(normalized, guidelinePatterns)) {
1298
2347
  if (!unique.has(absolutePath)) {
1299
2348
  unique.set(absolutePath, absolutePath);
@@ -1308,15 +2357,15 @@ function collectAttachmentFiles(attachments) {
1308
2357
  }
1309
2358
  const unique = /* @__PURE__ */ new Map();
1310
2359
  for (const attachment of attachments) {
1311
- const absolutePath = import_node_path3.default.resolve(attachment);
2360
+ const absolutePath = import_node_path6.default.resolve(attachment);
1312
2361
  if (!unique.has(absolutePath)) {
1313
2362
  unique.set(absolutePath, absolutePath);
1314
2363
  }
1315
2364
  }
1316
2365
  return Array.from(unique.values());
1317
2366
  }
1318
- function pathToFileUri(filePath) {
1319
- const absolutePath = import_node_path3.default.isAbsolute(filePath) ? filePath : import_node_path3.default.resolve(filePath);
2367
+ function pathToFileUri2(filePath) {
2368
+ const absolutePath = import_node_path6.default.isAbsolute(filePath) ? filePath : import_node_path6.default.resolve(filePath);
1320
2369
  const normalizedPath = absolutePath.replace(/\\/g, "/");
1321
2370
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
1322
2371
  return `file:///${normalizedPath}`;
@@ -1329,7 +2378,7 @@ function normalizeAttachments(attachments) {
1329
2378
  }
1330
2379
  const deduped = /* @__PURE__ */ new Set();
1331
2380
  for (const attachment of attachments) {
1332
- deduped.add(import_node_path3.default.resolve(attachment));
2381
+ deduped.add(import_node_path6.default.resolve(attachment));
1333
2382
  }
1334
2383
  return Array.from(deduped);
1335
2384
  }
@@ -1337,8 +2386,8 @@ function mergeAttachments(all) {
1337
2386
  const deduped = /* @__PURE__ */ new Set();
1338
2387
  for (const list of all) {
1339
2388
  if (!list) continue;
1340
- for (const attachment of list) {
1341
- deduped.add(import_node_path3.default.resolve(attachment));
2389
+ for (const inputFile of list) {
2390
+ deduped.add(import_node_path6.default.resolve(inputFile));
1342
2391
  }
1343
2392
  }
1344
2393
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -1383,9 +2432,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
1383
2432
  }
1384
2433
 
1385
2434
  // src/evaluation/providers/targets-file.ts
1386
- var import_node_fs3 = require("fs");
1387
- var import_promises4 = require("fs/promises");
1388
- var import_node_path4 = __toESM(require("path"), 1);
2435
+ var import_node_fs4 = require("fs");
2436
+ var import_promises5 = require("fs/promises");
2437
+ var import_node_path7 = __toESM(require("path"), 1);
1389
2438
  var import_yaml2 = require("yaml");
1390
2439
 
1391
2440
  // src/evaluation/providers/types.ts
@@ -1446,18 +2495,18 @@ function assertTargetDefinition(value, index, filePath) {
1446
2495
  }
1447
2496
  async function fileExists3(filePath) {
1448
2497
  try {
1449
- await (0, import_promises4.access)(filePath, import_node_fs3.constants.F_OK);
2498
+ await (0, import_promises5.access)(filePath, import_node_fs4.constants.F_OK);
1450
2499
  return true;
1451
2500
  } catch {
1452
2501
  return false;
1453
2502
  }
1454
2503
  }
1455
2504
  async function readTargetDefinitions(filePath) {
1456
- const absolutePath = import_node_path4.default.resolve(filePath);
2505
+ const absolutePath = import_node_path7.default.resolve(filePath);
1457
2506
  if (!await fileExists3(absolutePath)) {
1458
2507
  throw new Error(`targets.yaml not found at ${absolutePath}`);
1459
2508
  }
1460
- const raw = await (0, import_promises4.readFile)(absolutePath, "utf8");
2509
+ const raw = await (0, import_promises5.readFile)(absolutePath, "utf8");
1461
2510
  const parsed = (0, import_yaml2.parse)(raw);
1462
2511
  if (!isRecord(parsed)) {
1463
2512
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
@@ -1480,6 +2529,10 @@ function createProvider(target) {
1480
2529
  return new AnthropicProvider(target.name, target.config);
1481
2530
  case "gemini":
1482
2531
  return new GeminiProvider(target.name, target.config);
2532
+ case "cli":
2533
+ return new CliProvider(target.name, target.config);
2534
+ case "codex":
2535
+ return new CodexProvider(target.name, target.config);
1483
2536
  case "mock":
1484
2537
  return new MockProvider(target.name, target.config);
1485
2538
  case "vscode":
@@ -1496,230 +2549,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
1496
2549
  return createProvider(resolved);
1497
2550
  }
1498
2551
 
1499
- // src/evaluation/scoring.ts
1500
- var KEY_TERM_MATCH_THRESHOLD = 0.5;
1501
- var ACTION_WORDS = /* @__PURE__ */ new Set([
1502
- "use",
1503
- "avoid",
1504
- "prefer",
1505
- "replace",
1506
- "consider",
1507
- "ensure",
1508
- "remove",
1509
- "add"
1510
- ]);
1511
- var STOP_WORDS = /* @__PURE__ */ new Set([
1512
- "the",
1513
- "a",
1514
- "an",
1515
- "and",
1516
- "or",
1517
- "but",
1518
- "in",
1519
- "on",
1520
- "at",
1521
- "to",
1522
- "for",
1523
- "of",
1524
- "with",
1525
- "by",
1526
- "is",
1527
- "are",
1528
- "was",
1529
- "were",
1530
- "be",
1531
- "been",
1532
- "being",
1533
- "have",
1534
- "has",
1535
- "had",
1536
- "do",
1537
- "does",
1538
- "did",
1539
- "will",
1540
- "would",
1541
- "could",
1542
- "should"
1543
- ]);
1544
- var ERROR_PREFIXES = [
1545
- "error:",
1546
- "err:",
1547
- "vs code command failed",
1548
- "exception",
1549
- "traceback",
1550
- "no response file was generated",
1551
- "timed out",
1552
- "cli not found"
1553
- ];
1554
- function extractAspects(expectedResponse) {
1555
- const lines = expectedResponse.split(/\r?\n/).map((line) => line.trim());
1556
- const aspects = [];
1557
- for (const line of lines) {
1558
- if (line.length === 0) {
1559
- continue;
1560
- }
1561
- const bulletMatch = /^([-*•]|[0-9]+\.)\s*(.+)$/.exec(line);
1562
- if (bulletMatch) {
1563
- const normalized = normalizeAspect(bulletMatch[2]);
1564
- if (normalized.length > 0) {
1565
- aspects.push(normalized);
1566
- }
1567
- continue;
1568
- }
1569
- const lowered = line.toLowerCase();
1570
- if (Array.from(ACTION_WORDS).some((word) => lowered.startsWith(word))) {
1571
- const normalized = normalizeAspect(line);
1572
- if (normalized.length > 0) {
1573
- aspects.push(normalized);
1574
- }
1575
- }
1576
- }
1577
- return aspects;
1578
- }
1579
- function calculateHits(candidateResponse, expectedAspects) {
1580
- const { normalizedText, words } = normalizeCandidate(candidateResponse);
1581
- const hits = [];
1582
- for (const aspect of expectedAspects) {
1583
- if (matchesAspect(aspect, normalizedText, words)) {
1584
- hits.push(aspect);
1585
- }
1586
- }
1587
- return hits;
1588
- }
1589
- function calculateMisses(candidateResponse, expectedAspects, resolvedHits) {
1590
- const hits = new Set(resolvedHits ?? calculateHits(candidateResponse, expectedAspects));
1591
- return expectedAspects.filter((aspect) => !hits.has(aspect));
1592
- }
1593
- function scoreCandidateResponse(candidateResponse, expectedAspects) {
1594
- if (expectedAspects.length === 0) {
1595
- if (isErrorLike(candidateResponse)) {
1596
- return {
1597
- score: 0,
1598
- hits: [],
1599
- misses: ["Model produced an error instead of an answer."],
1600
- hitCount: 0,
1601
- totalAspects: 0,
1602
- rawAspects: []
1603
- };
1604
- }
1605
- return {
1606
- score: 1,
1607
- hits: [],
1608
- misses: [],
1609
- hitCount: 0,
1610
- totalAspects: 0,
1611
- rawAspects: []
1612
- };
1613
- }
1614
- const hits = calculateHits(candidateResponse, expectedAspects);
1615
- const misses = expectedAspects.filter((aspect) => !hits.includes(aspect));
1616
- const score = expectedAspects.length > 0 ? hits.length / expectedAspects.length : 0;
1617
- return {
1618
- score,
1619
- hits,
1620
- misses,
1621
- hitCount: hits.length,
1622
- totalAspects: expectedAspects.length,
1623
- rawAspects: expectedAspects
1624
- };
1625
- }
1626
- function isErrorLike(text) {
1627
- if (!text) {
1628
- return false;
1629
- }
1630
- const lowered = text.trim().toLowerCase();
1631
- return ERROR_PREFIXES.some((prefix) => lowered.startsWith(prefix));
1632
- }
1633
- function normalizeAspect(aspect) {
1634
- const sanitized = aspect.toLowerCase().replace(/[^\w\s]/g, " ").replace(/\s+/g, " ").trim();
1635
- return sanitized;
1636
- }
1637
- function normalizeCandidate(candidate) {
1638
- const lowered = candidate.toLowerCase();
1639
- const normalizedText = lowered.replace(/[^\w\s]/g, " ");
1640
- const words = new Set(normalizedText.split(/\s+/).filter((word) => word.length > 0));
1641
- return { normalizedText, words };
1642
- }
1643
- function matchesAspect(aspect, candidateNormalized, candidateWords) {
1644
- const keyTerms = extractKeyTerms(aspect);
1645
- if (keyTerms.length === 0) {
1646
- return false;
1647
- }
1648
- const matches = keyTerms.filter((term) => candidateWords.has(term)).length;
1649
- const ratio = matches / keyTerms.length;
1650
- if (ratio >= KEY_TERM_MATCH_THRESHOLD) {
1651
- return true;
1652
- }
1653
- const aspectWords = aspect.split(" ");
1654
- if (aspectWords.length >= 2) {
1655
- for (let index = 0; index < aspectWords.length - 1; index += 1) {
1656
- const phrase = `${aspectWords[index]} ${aspectWords[index + 1]}`;
1657
- if (candidateNormalized.includes(phrase)) {
1658
- return true;
1659
- }
1660
- }
1661
- }
1662
- return false;
1663
- }
1664
- function extractKeyTerms(aspect, maxTerms = 5) {
1665
- const terms = [];
1666
- const words = aspect.split(" ");
1667
- for (const word of words) {
1668
- if (word.length <= 2) {
1669
- continue;
1670
- }
1671
- if (STOP_WORDS.has(word)) {
1672
- continue;
1673
- }
1674
- terms.push(word);
1675
- if (terms.length >= maxTerms) {
1676
- break;
1677
- }
1678
- }
1679
- return terms;
1680
- }
1681
-
1682
- // src/evaluation/grading.ts
2552
+ // src/evaluation/evaluators.ts
1683
2553
  var import_node_crypto = require("crypto");
1684
- var HeuristicGrader = class {
1685
- kind = "heuristic";
1686
- grade(context) {
1687
- const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
1688
- const result = scoreCandidateResponse(context.candidate, expectedAspects);
1689
- const misses = [...result.misses];
1690
- if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
1691
- const firstLine = context.candidate.split(/\r?\n/)[0]?.trim();
1692
- if (firstLine && !misses.includes(firstLine)) {
1693
- misses.unshift(firstLine);
1694
- }
1695
- }
1696
- return {
1697
- score: result.score,
1698
- hits: result.hits,
1699
- misses,
1700
- expectedAspectCount: result.totalAspects,
1701
- rawAspects: result.rawAspects
1702
- };
1703
- }
1704
- };
1705
- var QualityGrader = class {
2554
+ var LlmJudgeEvaluator = class {
1706
2555
  kind = "llm_judge";
1707
2556
  resolveJudgeProvider;
1708
2557
  maxOutputTokens;
1709
2558
  temperature;
2559
+ customPrompt;
1710
2560
  constructor(options) {
1711
2561
  this.resolveJudgeProvider = options.resolveJudgeProvider;
1712
2562
  this.maxOutputTokens = options.maxOutputTokens;
1713
2563
  this.temperature = options.temperature;
2564
+ this.customPrompt = options.customPrompt;
1714
2565
  }
1715
- async grade(context) {
2566
+ async evaluate(context) {
1716
2567
  const judgeProvider = await this.resolveJudgeProvider(context);
1717
2568
  if (!judgeProvider) {
1718
2569
  throw new Error("No judge provider available for LLM grading");
1719
2570
  }
1720
2571
  const prompt = buildQualityPrompt(context.evalCase, context.candidate);
2572
+ const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
1721
2573
  const metadata = {
1722
- systemPrompt: QUALITY_SYSTEM_PROMPT
2574
+ ...systemPrompt !== void 0 ? { systemPrompt } : {},
2575
+ ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
1723
2576
  };
1724
2577
  const response = await judgeProvider.invoke({
1725
2578
  prompt,
@@ -1734,12 +2587,13 @@ var QualityGrader = class {
1734
2587
  const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
1735
2588
  const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
1736
2589
  const reasoning = parsed.reasoning ?? response.reasoning;
1737
- const graderRawRequest = {
2590
+ const evaluatorRawRequest = {
1738
2591
  id: (0, import_node_crypto.randomUUID)(),
1739
2592
  provider: judgeProvider.id,
1740
2593
  prompt,
1741
- systemPrompt: QUALITY_SYSTEM_PROMPT,
1742
- target: context.target.name
2594
+ target: context.target.name,
2595
+ ...systemPrompt !== void 0 ? { systemPrompt } : {},
2596
+ ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
1743
2597
  };
1744
2598
  return {
1745
2599
  score,
@@ -1747,7 +2601,7 @@ var QualityGrader = class {
1747
2601
  misses,
1748
2602
  expectedAspectCount: hits.length + misses.length || 1,
1749
2603
  reasoning,
1750
- graderRawRequest
2604
+ evaluatorRawRequest
1751
2605
  };
1752
2606
  }
1753
2607
  };
@@ -1865,11 +2719,117 @@ function extractJsonBlob(text) {
1865
2719
  function isNonEmptyString(value) {
1866
2720
  return typeof value === "string" && value.trim().length > 0;
1867
2721
  }
2722
+ var CodeEvaluator = class {
2723
+ kind = "code";
2724
+ script;
2725
+ cwd;
2726
+ agentTimeoutMs;
2727
+ constructor(options) {
2728
+ this.script = options.script;
2729
+ this.cwd = options.cwd;
2730
+ this.agentTimeoutMs = options.agentTimeoutMs;
2731
+ }
2732
+ async evaluate(context) {
2733
+ const inputPayload = JSON.stringify(
2734
+ {
2735
+ task: context.evalCase.task,
2736
+ outcome: context.evalCase.outcome,
2737
+ expected: context.evalCase.expected_assistant_raw,
2738
+ output: context.candidate,
2739
+ system_message: context.promptInputs.systemMessage ?? "",
2740
+ guideline_paths: context.evalCase.guideline_paths,
2741
+ attachments: context.evalCase.file_paths,
2742
+ user_segments: context.evalCase.user_segments
2743
+ },
2744
+ null,
2745
+ 2
2746
+ );
2747
+ try {
2748
+ const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
2749
+ const parsed = parseJsonSafe(stdout);
2750
+ const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
2751
+ const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
2752
+ const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
2753
+ const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
2754
+ return {
2755
+ score,
2756
+ hits,
2757
+ misses,
2758
+ expectedAspectCount: hits.length + misses.length || 1,
2759
+ reasoning,
2760
+ evaluatorRawRequest: {
2761
+ script: this.script,
2762
+ ...this.cwd ? { cwd: this.cwd } : {}
2763
+ }
2764
+ };
2765
+ } catch (error) {
2766
+ const message = error instanceof Error ? error.message : String(error);
2767
+ return {
2768
+ score: 0,
2769
+ hits: [],
2770
+ misses: [`Code evaluator failed: ${message}`],
2771
+ expectedAspectCount: 1,
2772
+ reasoning: message,
2773
+ evaluatorRawRequest: {
2774
+ script: this.script,
2775
+ ...this.cwd ? { cwd: this.cwd } : {},
2776
+ error: message
2777
+ }
2778
+ };
2779
+ }
2780
+ }
2781
+ };
2782
+ async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
2783
+ const { spawn: spawn2 } = await import("child_process");
2784
+ return await new Promise((resolve, reject) => {
2785
+ const child = spawn2(scriptPath, {
2786
+ shell: true,
2787
+ cwd
2788
+ });
2789
+ let stdout = "";
2790
+ let stderr = "";
2791
+ const timeout = agentTimeoutMs ? setTimeout(() => {
2792
+ child.kill();
2793
+ reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
2794
+ }, agentTimeoutMs) : void 0;
2795
+ child.stdout?.on("data", (data) => {
2796
+ stdout += data.toString();
2797
+ });
2798
+ child.stderr?.on("data", (data) => {
2799
+ stderr += data.toString();
2800
+ });
2801
+ child.on("error", (error) => {
2802
+ if (timeout !== void 0) {
2803
+ clearTimeout(timeout);
2804
+ }
2805
+ reject(error);
2806
+ });
2807
+ child.on("exit", (code) => {
2808
+ if (timeout !== void 0) {
2809
+ clearTimeout(timeout);
2810
+ }
2811
+ if (code && code !== 0 && stderr.length > 0) {
2812
+ reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
2813
+ return;
2814
+ }
2815
+ resolve(stdout.trim());
2816
+ });
2817
+ child.stdin?.write(input);
2818
+ child.stdin?.end();
2819
+ });
2820
+ }
2821
+ function parseJsonSafe(payload) {
2822
+ try {
2823
+ return JSON.parse(payload);
2824
+ } catch {
2825
+ return void 0;
2826
+ }
2827
+ }
1868
2828
 
1869
2829
  // src/evaluation/orchestrator.ts
1870
2830
  var import_node_crypto2 = require("crypto");
1871
- var import_promises5 = require("fs/promises");
1872
- var import_node_path5 = __toESM(require("path"), 1);
2831
+ var import_promises6 = require("fs/promises");
2832
+ var import_node_path8 = __toESM(require("path"), 1);
1873
2833
 
1874
2834
  // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
1875
2835
  var Node = class {
@@ -2016,7 +2976,7 @@ async function runEvaluation(options) {
2016
2976
  targets,
2017
2977
  env,
2018
2978
  providerFactory,
2019
- graders,
2979
+ evaluators,
2020
2980
  maxRetries,
2021
2981
  agentTimeoutMs,
2022
2982
  promptDumpDir,
@@ -2075,7 +3035,7 @@ async function runEvaluation(options) {
2075
3035
  }
2076
3036
  return getOrCreateProvider(resolvedJudge);
2077
3037
  };
2078
- const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
3038
+ const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
2079
3039
  const primaryProvider = getOrCreateProvider(target);
2080
3040
  const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
2081
3041
  if (target.providerBatching && !providerSupportsBatch && verbose) {
@@ -2098,13 +3058,14 @@ async function runEvaluation(options) {
2098
3058
  evalCases: filteredEvalCases,
2099
3059
  provider: primaryProvider,
2100
3060
  target,
2101
- graderRegistry,
3061
+ evaluatorRegistry,
2102
3062
  promptDumpDir,
2103
3063
  nowFn: now ?? (() => /* @__PURE__ */ new Date()),
2104
3064
  onProgress,
2105
3065
  onResult,
2106
3066
  verbose,
2107
- resolveJudgeProvider
3067
+ resolveJudgeProvider,
3068
+ agentTimeoutMs
2108
3069
  });
2109
3070
  } catch (error) {
2110
3071
  if (verbose) {
@@ -2135,7 +3096,7 @@ async function runEvaluation(options) {
2135
3096
  evalCase,
2136
3097
  provider: primaryProvider,
2137
3098
  target,
2138
- graders: graderRegistry,
3099
+ evaluators: evaluatorRegistry,
2139
3100
  maxRetries,
2140
3101
  agentTimeoutMs,
2141
3102
  promptDumpDir,
@@ -2201,12 +3162,13 @@ async function runBatchEvaluation(options) {
2201
3162
  evalCases,
2202
3163
  provider,
2203
3164
  target,
2204
- graderRegistry,
3165
+ evaluatorRegistry,
2205
3166
  promptDumpDir,
2206
3167
  nowFn,
2207
3168
  onProgress,
2208
3169
  onResult,
2209
- resolveJudgeProvider
3170
+ resolveJudgeProvider,
3171
+ agentTimeoutMs
2210
3172
  } = options;
2211
3173
  const promptInputsList = [];
2212
3174
  for (const evalCase of evalCases) {
@@ -2222,7 +3184,7 @@ async function runBatchEvaluation(options) {
2222
3184
  prompt: promptInputs.request,
2223
3185
  guidelines: promptInputs.guidelines,
2224
3186
  guideline_patterns: evalCase.guideline_patterns,
2225
- attachments: evalCase.file_paths,
3187
+ inputFiles: evalCase.file_paths,
2226
3188
  evalCaseId: evalCase.id,
2227
3189
  metadata: {
2228
3190
  systemPrompt: promptInputs.systemMessage ?? ""
@@ -2254,23 +3216,19 @@ async function runBatchEvaluation(options) {
2254
3216
  const evalCase = evalCases[i];
2255
3217
  const promptInputs = promptInputsList[i];
2256
3218
  const providerResponse = batchResponse[i];
2257
- const now = nowFn();
2258
- const graderKind = evalCase.grader ?? "heuristic";
2259
- const activeGrader = graderRegistry[graderKind] ?? graderRegistry.heuristic;
2260
- if (!activeGrader) {
2261
- throw new Error(`No grader registered for kind '${graderKind}'`);
2262
- }
2263
- let grade;
3219
+ let result;
2264
3220
  try {
2265
- grade = await activeGrader.grade({
3221
+ result = await evaluateCandidate({
2266
3222
  evalCase,
2267
3223
  candidate: providerResponse.text ?? "",
2268
3224
  target,
2269
3225
  provider,
2270
- attempt: 0,
3226
+ evaluators: evaluatorRegistry,
2271
3227
  promptInputs,
2272
- now,
2273
- judgeProvider: await resolveJudgeProvider(target)
3228
+ nowFn,
3229
+ attempt: 0,
3230
+ judgeProvider: await resolveJudgeProvider(target),
3231
+ agentTimeoutMs
2274
3232
  });
2275
3233
  } catch (error) {
2276
3234
  const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
@@ -2289,28 +3247,6 @@ async function runBatchEvaluation(options) {
2289
3247
  }
2290
3248
  continue;
2291
3249
  }
2292
- const completedAt = nowFn();
2293
- const rawRequest = {
2294
- request: promptInputs.request,
2295
- guidelines: promptInputs.guidelines,
2296
- guideline_paths: evalCase.guideline_paths,
2297
- system_message: promptInputs.systemMessage ?? ""
2298
- };
2299
- const result = {
2300
- eval_id: evalCase.id,
2301
- conversation_id: evalCase.conversation_id,
2302
- score: grade.score,
2303
- hits: grade.hits,
2304
- misses: grade.misses,
2305
- model_answer: providerResponse.text ?? "",
2306
- expected_aspect_count: grade.expectedAspectCount,
2307
- target: target.name,
2308
- timestamp: completedAt.toISOString(),
2309
- reasoning: grade.reasoning,
2310
- raw_aspects: grade.rawAspects,
2311
- raw_request: rawRequest,
2312
- grader_raw_request: grade.graderRawRequest
2313
- };
2314
3250
  results.push(result);
2315
3251
  if (onResult) {
2316
3252
  await onResult(result);
@@ -2332,7 +3268,7 @@ async function runEvalCase(options) {
2332
3268
  evalCase,
2333
3269
  provider,
2334
3270
  target,
2335
- graders,
3271
+ evaluators,
2336
3272
  now,
2337
3273
  maxRetries,
2338
3274
  agentTimeoutMs,
@@ -2387,27 +3323,49 @@ async function runEvalCase(options) {
2387
3323
  if (cacheKey && cache && !cachedResponse) {
2388
3324
  await cache.set(cacheKey, providerResponse);
2389
3325
  }
2390
- const graderKind = evalCase.grader ?? "heuristic";
2391
- const activeGrader = graders[graderKind] ?? graders.heuristic;
2392
- if (!activeGrader) {
2393
- throw new Error(`No grader registered for kind '${graderKind}'`);
2394
- }
2395
- let grade;
2396
3326
  try {
2397
- const gradeTimestamp = nowFn();
2398
- grade = await activeGrader.grade({
3327
+ return await evaluateCandidate({
2399
3328
  evalCase,
2400
3329
  candidate: providerResponse.text ?? "",
2401
3330
  target,
2402
3331
  provider,
2403
- attempt,
3332
+ evaluators,
2404
3333
  promptInputs,
2405
- now: gradeTimestamp,
2406
- judgeProvider
3334
+ nowFn,
3335
+ attempt,
3336
+ judgeProvider,
3337
+ agentTimeoutMs
2407
3338
  });
2408
3339
  } catch (error) {
2409
3340
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
2410
3341
  }
3342
+ }
3343
+ async function evaluateCandidate(options) {
3344
+ const {
3345
+ evalCase,
3346
+ candidate,
3347
+ target,
3348
+ provider,
3349
+ evaluators,
3350
+ promptInputs,
3351
+ nowFn,
3352
+ attempt,
3353
+ judgeProvider,
3354
+ agentTimeoutMs
3355
+ } = options;
3356
+ const gradeTimestamp = nowFn();
3357
+ const { score, evaluatorResults } = await runEvaluatorsForCase({
3358
+ evalCase,
3359
+ candidate,
3360
+ target,
3361
+ provider,
3362
+ evaluators,
3363
+ attempt,
3364
+ promptInputs,
3365
+ now: gradeTimestamp,
3366
+ judgeProvider,
3367
+ agentTimeoutMs
3368
+ });
2411
3369
  const completedAt = nowFn();
2412
3370
  const rawRequest = {
2413
3371
  request: promptInputs.request,
@@ -2418,28 +3376,200 @@ async function runEvalCase(options) {
2418
3376
  return {
2419
3377
  eval_id: evalCase.id,
2420
3378
  conversation_id: evalCase.conversation_id,
2421
- score: grade.score,
2422
- hits: grade.hits,
2423
- misses: grade.misses,
2424
- model_answer: providerResponse.text ?? "",
2425
- expected_aspect_count: grade.expectedAspectCount,
3379
+ score: score.score,
3380
+ hits: score.hits,
3381
+ misses: score.misses,
3382
+ model_answer: candidate,
3383
+ expected_aspect_count: score.expectedAspectCount,
2426
3384
  target: target.name,
2427
3385
  timestamp: completedAt.toISOString(),
2428
- reasoning: grade.reasoning,
2429
- raw_aspects: grade.rawAspects,
3386
+ reasoning: score.reasoning,
3387
+ raw_aspects: score.rawAspects,
2430
3388
  raw_request: rawRequest,
2431
- grader_raw_request: grade.graderRawRequest
3389
+ evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
3390
+ evaluator_results: evaluatorResults
2432
3391
  };
2433
3392
  }
3393
+ async function runEvaluatorsForCase(options) {
3394
+ const { evalCase, candidate, target, provider, evaluators, attempt, promptInputs, now, judgeProvider, agentTimeoutMs } = options;
3395
+ if (evalCase.evaluators && evalCase.evaluators.length > 0) {
3396
+ return runEvaluatorList({
3397
+ evalCase,
3398
+ evaluators: evalCase.evaluators,
3399
+ candidate,
3400
+ target,
3401
+ provider,
3402
+ evaluatorRegistry: evaluators,
3403
+ attempt,
3404
+ promptInputs,
3405
+ now,
3406
+ judgeProvider,
3407
+ agentTimeoutMs
3408
+ });
3409
+ }
3410
+ const evaluatorKind = evalCase.evaluator ?? "llm_judge";
3411
+ const activeEvaluator = evaluators[evaluatorKind] ?? evaluators.llm_judge;
3412
+ if (!activeEvaluator) {
3413
+ throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
3414
+ }
3415
+ const score = await activeEvaluator.evaluate({
3416
+ evalCase,
3417
+ candidate,
3418
+ target,
3419
+ provider,
3420
+ attempt,
3421
+ promptInputs,
3422
+ now,
3423
+ judgeProvider
3424
+ });
3425
+ return { score };
3426
+ }
3427
+ async function runEvaluatorList(options) {
3428
+ const {
3429
+ evalCase,
3430
+ evaluators,
3431
+ candidate,
3432
+ target,
3433
+ provider,
3434
+ evaluatorRegistry,
3435
+ attempt,
3436
+ promptInputs,
3437
+ now,
3438
+ judgeProvider,
3439
+ agentTimeoutMs
3440
+ } = options;
3441
+ const scored = [];
3442
+ const evaluatorResults = [];
3443
+ for (const evaluator of evaluators ?? []) {
3444
+ try {
3445
+ if (evaluator.type === "llm_judge") {
3446
+ const score2 = await runLlmJudgeEvaluator({
3447
+ config: evaluator,
3448
+ evalCase,
3449
+ candidate,
3450
+ target,
3451
+ provider,
3452
+ evaluatorRegistry,
3453
+ attempt,
3454
+ promptInputs,
3455
+ now,
3456
+ judgeProvider
3457
+ });
3458
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
3459
+ evaluatorResults.push({
3460
+ name: evaluator.name,
3461
+ type: evaluator.type,
3462
+ score: score2.score,
3463
+ hits: score2.hits,
3464
+ misses: score2.misses,
3465
+ reasoning: score2.reasoning,
3466
+ evaluator_raw_request: score2.evaluatorRawRequest
3467
+ });
3468
+ continue;
3469
+ }
3470
+ if (evaluator.type === "code") {
3471
+ const codeEvaluator = new CodeEvaluator({
3472
+ script: evaluator.script,
3473
+ cwd: evaluator.resolvedCwd ?? evaluator.cwd,
3474
+ agentTimeoutMs
3475
+ });
3476
+ const score2 = await codeEvaluator.evaluate({
3477
+ evalCase,
3478
+ candidate,
3479
+ target,
3480
+ provider,
3481
+ attempt,
3482
+ promptInputs,
3483
+ now
3484
+ });
3485
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
3486
+ evaluatorResults.push({
3487
+ name: evaluator.name,
3488
+ type: evaluator.type,
3489
+ score: score2.score,
3490
+ hits: score2.hits,
3491
+ misses: score2.misses,
3492
+ reasoning: score2.reasoning,
3493
+ evaluator_raw_request: score2.evaluatorRawRequest
3494
+ });
3495
+ continue;
3496
+ }
3497
+ } catch (error) {
3498
+ const message = error instanceof Error ? error.message : String(error);
3499
+ const fallbackScore = {
3500
+ score: 0,
3501
+ hits: [],
3502
+ misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
3503
+ expectedAspectCount: 1,
3504
+ reasoning: message
3505
+ };
3506
+ scored.push({ score: fallbackScore, name: evaluator.name ?? "unknown", type: evaluator.type ?? "unknown" });
3507
+ evaluatorResults.push({
3508
+ name: evaluator.name ?? "unknown",
3509
+ type: evaluator.type ?? "unknown",
3510
+ score: 0,
3511
+ hits: [],
3512
+ misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
3513
+ reasoning: message
3514
+ });
3515
+ }
3516
+ }
3517
+ const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
3518
+ const hits = scored.flatMap((entry) => entry.score.hits);
3519
+ const misses = scored.flatMap((entry) => entry.score.misses);
3520
+ const expectedAspectCount = scored.reduce((total, entry) => total + (entry.score.expectedAspectCount ?? 0), 0);
3521
+ const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
3522
+ const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
3523
+ const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
3524
+ const score = {
3525
+ score: aggregateScore,
3526
+ hits,
3527
+ misses,
3528
+ expectedAspectCount,
3529
+ reasoning,
3530
+ rawAspects: rawAspects.length > 0 ? rawAspects : void 0
3531
+ };
3532
+ return { score, evaluatorResults };
3533
+ }
3534
+ async function runLlmJudgeEvaluator(options) {
3535
+ const { config, evalCase, candidate, target, provider, evaluatorRegistry, attempt, promptInputs, now, judgeProvider } = options;
3536
+ const customPrompt = await resolveCustomPrompt(config);
3537
+ return evaluatorRegistry.llm_judge.evaluate({
3538
+ evalCase,
3539
+ candidate,
3540
+ target,
3541
+ provider,
3542
+ attempt,
3543
+ promptInputs,
3544
+ now,
3545
+ judgeProvider,
3546
+ systemPrompt: customPrompt,
3547
+ evaluator: config,
3548
+ judgeModel: config.model
3549
+ });
3550
+ }
3551
+ async function resolveCustomPrompt(config) {
3552
+ if (config.promptPath) {
3553
+ try {
3554
+ return await (0, import_promises6.readFile)(config.promptPath, "utf8");
3555
+ } catch (error) {
3556
+ const message = error instanceof Error ? error.message : String(error);
3557
+ console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
3558
+ }
3559
+ }
3560
+ return config.prompt;
3561
+ }
3562
+ function isNonEmptyString2(value) {
3563
+ return typeof value === "string" && value.trim().length > 0;
3564
+ }
2434
3565
  function filterEvalCases(evalCases, evalId) {
2435
3566
  if (!evalId) {
2436
3567
  return evalCases;
2437
3568
  }
2438
3569
  return evalCases.filter((evalCase) => evalCase.id === evalId);
2439
3570
  }
2440
- function buildGraderRegistry(overrides, resolveJudgeProvider) {
2441
- const heuristic = overrides?.heuristic ?? new HeuristicGrader();
2442
- const llmJudge = overrides?.llm_judge ?? new QualityGrader({
3571
+ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
3572
+ const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
2443
3573
  resolveJudgeProvider: async (context) => {
2444
3574
  if (context.judgeProvider) {
2445
3575
  return context.judgeProvider;
@@ -2449,22 +3579,21 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
2449
3579
  });
2450
3580
  return {
2451
3581
  ...overrides,
2452
- heuristic,
2453
3582
  llm_judge: llmJudge
2454
3583
  };
2455
3584
  }
2456
3585
  async function dumpPrompt(directory, evalCase, promptInputs) {
2457
3586
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
2458
3587
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
2459
- const filePath = import_node_path5.default.resolve(directory, filename);
2460
- await (0, import_promises5.mkdir)(import_node_path5.default.dirname(filePath), { recursive: true });
3588
+ const filePath = import_node_path8.default.resolve(directory, filename);
3589
+ await (0, import_promises6.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
2461
3590
  const payload = {
2462
3591
  eval_id: evalCase.id,
2463
3592
  request: promptInputs.request,
2464
3593
  guidelines: promptInputs.guidelines,
2465
3594
  guideline_paths: evalCase.guideline_paths
2466
3595
  };
2467
- await (0, import_promises5.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
3596
+ await (0, import_promises6.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
2468
3597
  }
2469
3598
  function sanitizeFilename(value) {
2470
3599
  if (!value) {
@@ -2474,7 +3603,7 @@ function sanitizeFilename(value) {
2474
3603
  return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
2475
3604
  }
2476
3605
  async function invokeProvider(provider, options) {
2477
- const { evalCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
3606
+ const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
2478
3607
  const controller = new AbortController();
2479
3608
  const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
2480
3609
  if (signal) {
@@ -2485,7 +3614,7 @@ async function invokeProvider(provider, options) {
2485
3614
  prompt: promptInputs.request,
2486
3615
  guidelines: promptInputs.guidelines,
2487
3616
  guideline_patterns: evalCase.guideline_patterns,
2488
- attachments: evalCase.file_paths,
3617
+ inputFiles: evalCase.file_paths,
2489
3618
  evalCaseId: evalCase.id,
2490
3619
  attempt,
2491
3620
  metadata: {
@@ -2554,25 +3683,20 @@ function createAgentKernel() {
2554
3683
  }
2555
3684
  // Annotate the CommonJS export names for ESM import in node:
2556
3685
  0 && (module.exports = {
2557
- GRADER_KINDS,
2558
- HeuristicGrader,
2559
- QualityGrader,
3686
+ CodeEvaluator,
3687
+ LlmJudgeEvaluator,
2560
3688
  TEST_MESSAGE_ROLES,
2561
3689
  buildDirectoryChain,
2562
3690
  buildPromptInputs,
2563
3691
  buildSearchRoots,
2564
- calculateHits,
2565
- calculateMisses,
2566
3692
  createAgentKernel,
2567
3693
  createProvider,
2568
3694
  ensureVSCodeSubagents,
2569
- extractAspects,
2570
3695
  extractCodeBlocks,
2571
3696
  fileExists,
2572
3697
  findGitRoot,
2573
3698
  getHitCount,
2574
- isErrorLike,
2575
- isGraderKind,
3699
+ isEvaluatorKind,
2576
3700
  isGuidelineFile,
2577
3701
  isJsonObject,
2578
3702
  isJsonValue,
@@ -2585,7 +3709,6 @@ function createAgentKernel() {
2585
3709
  resolveFileReference,
2586
3710
  resolveTargetDefinition,
2587
3711
  runEvalCase,
2588
- runEvaluation,
2589
- scoreCandidateResponse
3712
+ runEvaluation
2590
3713
  });
2591
3714
  //# sourceMappingURL=index.cjs.map