@agentv/core 0.2.11 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -30,25 +30,20 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
30
30
  // src/index.ts
31
31
  var index_exports = {};
32
32
  __export(index_exports, {
33
- GRADER_KINDS: () => GRADER_KINDS,
34
- HeuristicGrader: () => HeuristicGrader,
35
- QualityGrader: () => QualityGrader,
33
+ CodeEvaluator: () => CodeEvaluator,
34
+ LlmJudgeEvaluator: () => LlmJudgeEvaluator,
36
35
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
37
36
  buildDirectoryChain: () => buildDirectoryChain,
38
37
  buildPromptInputs: () => buildPromptInputs,
39
38
  buildSearchRoots: () => buildSearchRoots,
40
- calculateHits: () => calculateHits,
41
- calculateMisses: () => calculateMisses,
42
39
  createAgentKernel: () => createAgentKernel,
43
40
  createProvider: () => createProvider,
44
41
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
45
- extractAspects: () => extractAspects,
46
42
  extractCodeBlocks: () => extractCodeBlocks,
47
43
  fileExists: () => fileExists,
48
44
  findGitRoot: () => findGitRoot,
49
45
  getHitCount: () => getHitCount,
50
- isErrorLike: () => isErrorLike,
51
- isGraderKind: () => isGraderKind,
46
+ isEvaluatorKind: () => isEvaluatorKind,
52
47
  isGuidelineFile: () => isGuidelineFile,
53
48
  isJsonObject: () => isJsonObject,
54
49
  isJsonValue: () => isJsonValue,
@@ -61,8 +56,7 @@ __export(index_exports, {
61
56
  resolveFileReference: () => resolveFileReference,
62
57
  resolveTargetDefinition: () => resolveTargetDefinition,
63
58
  runEvalCase: () => runEvalCase,
64
- runEvaluation: () => runEvaluation,
65
- scoreCandidateResponse: () => scoreCandidateResponse
59
+ runEvaluation: () => runEvaluation
66
60
  });
67
61
  module.exports = __toCommonJS(index_exports);
68
62
 
@@ -107,11 +101,10 @@ function isTestMessage(value) {
107
101
  }
108
102
  return candidate.content.every(isJsonObject);
109
103
  }
110
- var GRADER_KIND_VALUES = ["heuristic", "llm_judge"];
111
- var GRADER_KINDS = GRADER_KIND_VALUES;
112
- var GRADER_KIND_SET = new Set(GRADER_KIND_VALUES);
113
- function isGraderKind(value) {
114
- return typeof value === "string" && GRADER_KIND_SET.has(value);
104
+ var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
105
+ var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
106
+ function isEvaluatorKind(value) {
107
+ return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
115
108
  }
116
109
  function getHitCount(result) {
117
110
  return result.hits.length;
@@ -325,7 +318,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
325
318
  if (!Array.isArray(rawTestcases)) {
326
319
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
327
320
  }
328
- const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
321
+ const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
329
322
  const results = [];
330
323
  for (const rawEvalcase of rawTestcases) {
331
324
  if (!isJsonObject(rawEvalcase)) {
@@ -448,7 +441,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
448
441
  const assistantContent = assistantMessages[0]?.content;
449
442
  const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
450
443
  const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
451
- const testCaseGrader = coerceGrader(evalcase.grader) ?? globalGrader;
444
+ const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
445
+ const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
452
446
  const userFilePaths = [];
453
447
  for (const segment of userSegments) {
454
448
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -471,7 +465,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
471
465
  file_paths: allFilePaths,
472
466
  code_snippets: codeSnippets,
473
467
  outcome,
474
- grader: testCaseGrader
468
+ evaluator: testCaseEvaluatorKind,
469
+ evaluators
475
470
  };
476
471
  if (verbose) {
477
472
  console.log(`
@@ -632,14 +627,88 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
632
627
  }
633
628
  return parts.join(" ");
634
629
  }
635
- function coerceGrader(candidate) {
630
+ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
631
+ const execution = rawEvalCase.execution;
632
+ const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators;
633
+ if (candidateEvaluators === void 0) {
634
+ return void 0;
635
+ }
636
+ if (!Array.isArray(candidateEvaluators)) {
637
+ logWarning(`Skipping evaluators for '${evalId}': expected array`);
638
+ return void 0;
639
+ }
640
+ const evaluators = [];
641
+ for (const rawEvaluator of candidateEvaluators) {
642
+ if (!isJsonObject(rawEvaluator)) {
643
+ logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
644
+ continue;
645
+ }
646
+ const name = asString(rawEvaluator.name);
647
+ const typeValue = rawEvaluator.type;
648
+ if (!name || !isEvaluatorKind(typeValue)) {
649
+ logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
650
+ continue;
651
+ }
652
+ if (typeValue === "code") {
653
+ const script = asString(rawEvaluator.script);
654
+ if (!script) {
655
+ logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
656
+ continue;
657
+ }
658
+ const cwd = asString(rawEvaluator.cwd);
659
+ let resolvedCwd;
660
+ if (cwd) {
661
+ const resolved = await resolveFileReference(cwd, searchRoots);
662
+ if (resolved.resolvedPath) {
663
+ resolvedCwd = import_node_path2.default.resolve(resolved.resolvedPath);
664
+ } else {
665
+ logWarning(
666
+ `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
667
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
668
+ );
669
+ }
670
+ }
671
+ evaluators.push({
672
+ name,
673
+ type: "code",
674
+ script,
675
+ cwd,
676
+ resolvedCwd
677
+ });
678
+ continue;
679
+ }
680
+ const prompt = asString(rawEvaluator.prompt);
681
+ let promptPath;
682
+ if (prompt) {
683
+ const resolved = await resolveFileReference(prompt, searchRoots);
684
+ if (resolved.resolvedPath) {
685
+ promptPath = import_node_path2.default.resolve(resolved.resolvedPath);
686
+ } else {
687
+ logWarning(
688
+ `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
689
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
690
+ );
691
+ }
692
+ }
693
+ const model = asString(rawEvaluator.model);
694
+ evaluators.push({
695
+ name,
696
+ type: "llm_judge",
697
+ prompt,
698
+ promptPath,
699
+ model
700
+ });
701
+ }
702
+ return evaluators.length > 0 ? evaluators : void 0;
703
+ }
704
+ function coerceEvaluator(candidate, contextId) {
636
705
  if (typeof candidate !== "string") {
637
706
  return void 0;
638
707
  }
639
- if (isGraderKind(candidate)) {
708
+ if (isEvaluatorKind(candidate)) {
640
709
  return candidate;
641
710
  }
642
- logWarning(`Unknown grader '${candidate}', falling back to default`);
711
+ logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
643
712
  return void 0;
644
713
  }
645
714
  function logWarning(message, details) {
@@ -835,6 +904,214 @@ var GeminiProvider = class {
835
904
  }
836
905
  };
837
906
 
907
+ // src/evaluation/providers/cli.ts
908
+ var import_node_child_process = require("child_process");
909
+ var import_node_path3 = __toESM(require("path"), 1);
910
+ var import_node_util = require("util");
911
+ var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
912
+ var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
913
+ async function defaultCommandRunner(command, options) {
914
+ const execOptions = {
915
+ cwd: options.cwd,
916
+ env: options.env,
917
+ timeout: options.timeoutMs,
918
+ signal: options.signal,
919
+ maxBuffer: DEFAULT_MAX_BUFFER,
920
+ shell: process.platform === "win32" ? "powershell.exe" : void 0
921
+ };
922
+ try {
923
+ const { stdout, stderr } = await execAsync(command, execOptions);
924
+ return {
925
+ stdout,
926
+ stderr,
927
+ exitCode: 0,
928
+ failed: false,
929
+ timedOut: false,
930
+ signal: null
931
+ };
932
+ } catch (error) {
933
+ const execError = error;
934
+ return {
935
+ stdout: execError.stdout ?? "",
936
+ stderr: execError.stderr ?? "",
937
+ exitCode: typeof execError.code === "number" ? execError.code : null,
938
+ failed: true,
939
+ timedOut: execError.timedOut === true || execError.killed === true,
940
+ signal: execError.signal ?? null
941
+ };
942
+ }
943
+ }
944
+ var CliProvider = class {
945
+ id;
946
+ kind = "cli";
947
+ targetName;
948
+ supportsBatch = false;
949
+ config;
950
+ runCommand;
951
+ healthcheckPromise;
952
+ constructor(targetName, config, runner = defaultCommandRunner) {
953
+ this.targetName = targetName;
954
+ this.id = `cli:${targetName}`;
955
+ this.config = config;
956
+ this.runCommand = runner;
957
+ }
958
+ async invoke(request) {
959
+ if (request.signal?.aborted) {
960
+ throw new Error("CLI provider request was aborted before execution");
961
+ }
962
+ await this.ensureHealthy(request.signal);
963
+ const templateValues = buildTemplateValues(request, this.config);
964
+ const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
965
+ const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
966
+ const result = await this.runCommand(renderedCommand, {
967
+ cwd: this.config.cwd,
968
+ env,
969
+ timeoutMs: this.config.timeoutMs,
970
+ signal: request.signal
971
+ });
972
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
973
+ if (request.signal?.aborted) {
974
+ throw new Error("CLI provider request was aborted");
975
+ }
976
+ if (result.timedOut) {
977
+ throw new Error(
978
+ `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
979
+ );
980
+ }
981
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
982
+ const detail = result.stderr.trim() || result.stdout.trim();
983
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
984
+ throw new Error(message);
985
+ }
986
+ return {
987
+ text: result.stdout,
988
+ raw: {
989
+ command: renderedCommand,
990
+ stderr: result.stderr,
991
+ exitCode: result.exitCode ?? 0,
992
+ cwd: this.config.cwd
993
+ }
994
+ };
995
+ }
996
+ async ensureHealthy(signal) {
997
+ if (!this.config.healthcheck) {
998
+ return;
999
+ }
1000
+ if (!this.healthcheckPromise) {
1001
+ this.healthcheckPromise = this.runHealthcheck(this.config.healthcheck, signal);
1002
+ }
1003
+ return this.healthcheckPromise;
1004
+ }
1005
+ async runHealthcheck(healthcheck, signal) {
1006
+ if (!healthcheck) {
1007
+ return;
1008
+ }
1009
+ const timeoutMs = healthcheck.timeoutMs ?? this.config.timeoutMs;
1010
+ if (healthcheck.type === "http") {
1011
+ const controller = new AbortController();
1012
+ const timer = timeoutMs ? setTimeout(() => controller.abort(), timeoutMs) : void 0;
1013
+ signal?.addEventListener("abort", () => controller.abort(), { once: true });
1014
+ try {
1015
+ const response = await fetch(healthcheck.url, { method: "GET", signal: controller.signal });
1016
+ if (!response.ok) {
1017
+ throw new Error(`HTTP ${response.status} ${response.statusText}`);
1018
+ }
1019
+ } catch (error) {
1020
+ const reason = error instanceof Error ? error.message : String(error);
1021
+ throw new Error(`CLI healthcheck failed for '${this.targetName}': ${reason}`);
1022
+ } finally {
1023
+ if (timer !== void 0) {
1024
+ clearTimeout(timer);
1025
+ }
1026
+ }
1027
+ return;
1028
+ }
1029
+ const renderedCommand = renderTemplate(
1030
+ healthcheck.commandTemplate,
1031
+ buildTemplateValues(
1032
+ {
1033
+ prompt: "",
1034
+ guidelines: "",
1035
+ inputFiles: [],
1036
+ evalCaseId: "",
1037
+ attempt: 0
1038
+ },
1039
+ this.config
1040
+ )
1041
+ );
1042
+ const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
1043
+ const result = await this.runCommand(renderedCommand, {
1044
+ cwd: healthcheck.cwd ?? this.config.cwd,
1045
+ env,
1046
+ timeoutMs,
1047
+ signal
1048
+ });
1049
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
1050
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
1051
+ const detail = result.stderr.trim() || result.stdout.trim();
1052
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI healthcheck command exited with code ${codeText}`;
1053
+ throw new Error(`CLI healthcheck failed for '${this.targetName}': ${message}`);
1054
+ }
1055
+ }
1056
+ };
1057
+ function buildTemplateValues(request, config) {
1058
+ const inputFiles = normalizeInputFiles(request.inputFiles);
1059
+ return {
1060
+ PROMPT: shellEscape(request.prompt ?? ""),
1061
+ GUIDELINES: shellEscape(request.guidelines ?? ""),
1062
+ EVAL_ID: shellEscape(request.evalCaseId ?? ""),
1063
+ ATTEMPT: shellEscape(String(request.attempt ?? 0)),
1064
+ FILES: formatFileList(inputFiles, config.filesFormat)
1065
+ };
1066
+ }
1067
+ function normalizeInputFiles(inputFiles) {
1068
+ if (!inputFiles || inputFiles.length === 0) {
1069
+ return void 0;
1070
+ }
1071
+ const unique = /* @__PURE__ */ new Map();
1072
+ for (const inputFile of inputFiles) {
1073
+ const absolutePath = import_node_path3.default.resolve(inputFile);
1074
+ if (!unique.has(absolutePath)) {
1075
+ unique.set(absolutePath, absolutePath);
1076
+ }
1077
+ }
1078
+ return Array.from(unique.values());
1079
+ }
1080
+ function formatFileList(files, template) {
1081
+ if (!files || files.length === 0) {
1082
+ return "";
1083
+ }
1084
+ const formatter = template ?? "{path}";
1085
+ return files.map((filePath) => {
1086
+ const escapedPath = shellEscape(filePath);
1087
+ const escapedName = shellEscape(import_node_path3.default.basename(filePath));
1088
+ return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
1089
+ }).join(" ");
1090
+ }
1091
+ function renderTemplate(template, values) {
1092
+ return template.replace(/\{([A-Z_]+)\}/g, (match, key) => {
1093
+ const replacement = values[key];
1094
+ return replacement !== void 0 ? replacement : match;
1095
+ });
1096
+ }
1097
+ function shellEscape(value) {
1098
+ if (value.length === 0) {
1099
+ return "''";
1100
+ }
1101
+ if (process.platform === "win32") {
1102
+ const escaped = value.replace(/"/g, '\\"');
1103
+ return `"${escaped}"`;
1104
+ }
1105
+ return `'${value.replace(/'/g, `'"'"'`)}'`;
1106
+ }
1107
+ function formatTimeoutSuffix(timeoutMs) {
1108
+ if (!timeoutMs || timeoutMs <= 0) {
1109
+ return "";
1110
+ }
1111
+ const seconds = Math.ceil(timeoutMs / 1e3);
1112
+ return ` after ${seconds}s`;
1113
+ }
1114
+
838
1115
  // src/evaluation/providers/mock.ts
839
1116
  var DEFAULT_MOCK_RESPONSE = '{"answer":"Mock provider response. Configure targets.yaml to supply a custom value."}';
840
1117
  var MockProvider = class {
@@ -878,6 +1155,7 @@ var MockProvider = class {
878
1155
 
879
1156
  // src/evaluation/providers/targets.ts
880
1157
  var import_zod = require("zod");
1158
+ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES"]);
881
1159
  var BASE_TARGET_SCHEMA = import_zod.z.object({
882
1160
  name: import_zod.z.string().min(1, "target name is required"),
883
1161
  provider: import_zod.z.string().min(1, "provider is required"),
@@ -934,6 +1212,16 @@ function resolveTargetDefinition(definition, env = process.env) {
934
1212
  providerBatching,
935
1213
  config: resolveGeminiConfig(parsed, env)
936
1214
  };
1215
+ case "codex":
1216
+ case "codex-cli":
1217
+ return {
1218
+ kind: "codex",
1219
+ name: parsed.name,
1220
+ judgeTarget: parsed.judge_target,
1221
+ workers: parsed.workers,
1222
+ providerBatching,
1223
+ config: resolveCodexConfig(parsed, env)
1224
+ };
937
1225
  case "mock":
938
1226
  return {
939
1227
  kind: "mock",
@@ -953,6 +1241,15 @@ function resolveTargetDefinition(definition, env = process.env) {
953
1241
  providerBatching,
954
1242
  config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
955
1243
  };
1244
+ case "cli":
1245
+ return {
1246
+ kind: "cli",
1247
+ name: parsed.name,
1248
+ judgeTarget: parsed.judge_target,
1249
+ workers: parsed.workers,
1250
+ providerBatching,
1251
+ config: resolveCliConfig(parsed, env)
1252
+ };
956
1253
  default:
957
1254
  throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
958
1255
  }
@@ -1020,6 +1317,29 @@ function resolveGeminiConfig(target, env) {
1020
1317
  maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`)
1021
1318
  };
1022
1319
  }
1320
+ function resolveCodexConfig(target, env) {
1321
+ const settings = target.settings ?? {};
1322
+ const executableSource = settings.executable ?? settings.command ?? settings.binary;
1323
+ const argsSource = settings.args ?? settings.arguments;
1324
+ const cwdSource = settings.cwd;
1325
+ const timeoutSource = settings.timeout_seconds ?? settings.timeoutSeconds;
1326
+ const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
1327
+ allowLiteral: true,
1328
+ optionalEnv: true
1329
+ }) ?? "codex";
1330
+ const args = resolveOptionalStringArray(argsSource, env, `${target.name} codex args`);
1331
+ const cwd = resolveOptionalString(cwdSource, env, `${target.name} codex cwd`, {
1332
+ allowLiteral: true,
1333
+ optionalEnv: true
1334
+ });
1335
+ const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} codex timeout`);
1336
+ return {
1337
+ executable,
1338
+ args,
1339
+ cwd,
1340
+ timeoutMs
1341
+ };
1342
+ }
1023
1343
  function resolveMockConfig(target) {
1024
1344
  const settings = target.settings ?? {};
1025
1345
  const response = typeof settings.response === "string" ? settings.response : void 0;
@@ -1049,6 +1369,125 @@ function resolveVSCodeConfig(target, env, insiders) {
1049
1369
  workspaceTemplate
1050
1370
  };
1051
1371
  }
1372
+ function resolveCliConfig(target, env) {
1373
+ const settings = target.settings ?? {};
1374
+ const commandTemplateSource = settings.command_template ?? settings.commandTemplate;
1375
+ const filesFormat = resolveOptionalLiteralString(
1376
+ settings.files_format ?? settings.filesFormat ?? settings.attachments_format ?? settings.attachmentsFormat
1377
+ );
1378
+ const cwd = resolveOptionalString(settings.cwd, env, `${target.name} working directory`, {
1379
+ allowLiteral: true,
1380
+ optionalEnv: true
1381
+ });
1382
+ const envOverrides = resolveEnvOverrides(settings.env, env, target.name);
1383
+ const timeoutMs = resolveTimeoutMs(settings.timeout_seconds ?? settings.timeoutSeconds, `${target.name} timeout`);
1384
+ const healthcheck = resolveCliHealthcheck(settings.healthcheck, env, target.name);
1385
+ const commandTemplate = resolveString(
1386
+ commandTemplateSource,
1387
+ env,
1388
+ `${target.name} CLI command template`,
1389
+ true
1390
+ );
1391
+ assertSupportedCliPlaceholders(commandTemplate, `${target.name} CLI command template`);
1392
+ return {
1393
+ commandTemplate,
1394
+ filesFormat,
1395
+ cwd,
1396
+ env: envOverrides,
1397
+ timeoutMs,
1398
+ healthcheck
1399
+ };
1400
+ }
1401
+ function resolveEnvOverrides(source, env, targetName) {
1402
+ if (source === void 0 || source === null) {
1403
+ return void 0;
1404
+ }
1405
+ if (typeof source !== "object" || Array.isArray(source)) {
1406
+ throw new Error(`${targetName} env overrides must be an object map of strings`);
1407
+ }
1408
+ const entries = Object.entries(source);
1409
+ const resolved = {};
1410
+ for (const [key, value] of entries) {
1411
+ if (typeof value !== "string") {
1412
+ throw new Error(`${targetName} env override '${key}' must be a string`);
1413
+ }
1414
+ const resolvedValue = resolveString(value, env, `${targetName} env override '${key}'`);
1415
+ resolved[key] = resolvedValue;
1416
+ }
1417
+ return Object.keys(resolved).length > 0 ? resolved : void 0;
1418
+ }
1419
+ function resolveTimeoutMs(source, description) {
1420
+ const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
1421
+ if (seconds === void 0) {
1422
+ return void 0;
1423
+ }
1424
+ if (seconds <= 0) {
1425
+ throw new Error(`${description} must be greater than zero seconds`);
1426
+ }
1427
+ return Math.floor(seconds * 1e3);
1428
+ }
1429
+ function resolveCliHealthcheck(source, env, targetName) {
1430
+ if (source === void 0 || source === null) {
1431
+ return void 0;
1432
+ }
1433
+ if (typeof source !== "object" || Array.isArray(source)) {
1434
+ throw new Error(`${targetName} healthcheck must be an object`);
1435
+ }
1436
+ const candidate = source;
1437
+ const type = candidate.type;
1438
+ const timeoutMs = resolveTimeoutMs(
1439
+ candidate.timeout_seconds ?? candidate.timeoutSeconds,
1440
+ `${targetName} healthcheck timeout`
1441
+ );
1442
+ if (type === "http") {
1443
+ const url = resolveString(candidate.url, env, `${targetName} healthcheck URL`);
1444
+ return {
1445
+ type: "http",
1446
+ url,
1447
+ timeoutMs
1448
+ };
1449
+ }
1450
+ if (type === "command") {
1451
+ const commandTemplate = resolveString(
1452
+ candidate.command_template ?? candidate.commandTemplate,
1453
+ env,
1454
+ `${targetName} healthcheck command template`,
1455
+ true
1456
+ );
1457
+ assertSupportedCliPlaceholders(commandTemplate, `${targetName} healthcheck command template`);
1458
+ const cwd = resolveOptionalString(candidate.cwd, env, `${targetName} healthcheck cwd`, {
1459
+ allowLiteral: true,
1460
+ optionalEnv: true
1461
+ });
1462
+ return {
1463
+ type: "command",
1464
+ commandTemplate,
1465
+ timeoutMs,
1466
+ cwd
1467
+ };
1468
+ }
1469
+ throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
1470
+ }
1471
+ function assertSupportedCliPlaceholders(template, description) {
1472
+ const placeholders = extractCliPlaceholders(template);
1473
+ for (const placeholder of placeholders) {
1474
+ if (!CLI_PLACEHOLDERS.has(placeholder)) {
1475
+ throw new Error(
1476
+ `${description} includes unsupported placeholder '{${placeholder}}'. Supported placeholders: ${Array.from(CLI_PLACEHOLDERS).join(", ")}`
1477
+ );
1478
+ }
1479
+ }
1480
+ }
1481
+ function extractCliPlaceholders(template) {
1482
+ const matches = template.matchAll(/\{([A-Z_]+)\}/g);
1483
+ const results = [];
1484
+ for (const match of matches) {
1485
+ if (match[1]) {
1486
+ results.push(match[1]);
1487
+ }
1488
+ }
1489
+ return results;
1490
+ }
1052
1491
  function resolveString(source, env, description, allowLiteral = false) {
1053
1492
  const value = resolveOptionalString(source, env, description, {
1054
1493
  allowLiteral,
@@ -1079,11 +1518,14 @@ function resolveOptionalString(source, env, description, options) {
1079
1518
  }
1080
1519
  const allowLiteral = options?.allowLiteral ?? false;
1081
1520
  const optionalEnv = options?.optionalEnv ?? false;
1082
- if (!allowLiteral && isLikelyEnvReference(trimmed)) {
1521
+ const looksLikeEnv = isLikelyEnvReference(trimmed);
1522
+ if (looksLikeEnv) {
1083
1523
  if (optionalEnv) {
1084
1524
  return void 0;
1085
1525
  }
1086
- throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
1526
+ if (!allowLiteral) {
1527
+ throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
1528
+ }
1087
1529
  }
1088
1530
  return trimmed;
1089
1531
  }
@@ -1133,10 +1575,42 @@ function resolveOptionalBoolean(source) {
1133
1575
  function isLikelyEnvReference(value) {
1134
1576
  return /^[A-Z0-9_]+$/.test(value);
1135
1577
  }
1578
+ function resolveOptionalStringArray(source, env, description) {
1579
+ if (source === void 0 || source === null) {
1580
+ return void 0;
1581
+ }
1582
+ if (!Array.isArray(source)) {
1583
+ throw new Error(`${description} must be an array of strings`);
1584
+ }
1585
+ if (source.length === 0) {
1586
+ return void 0;
1587
+ }
1588
+ const resolved = [];
1589
+ for (let i = 0; i < source.length; i++) {
1590
+ const item = source[i];
1591
+ if (typeof item !== "string") {
1592
+ throw new Error(`${description}[${i}] must be a string`);
1593
+ }
1594
+ const trimmed = item.trim();
1595
+ if (trimmed.length === 0) {
1596
+ throw new Error(`${description}[${i}] cannot be empty`);
1597
+ }
1598
+ const envValue = env[trimmed];
1599
+ if (envValue !== void 0) {
1600
+ if (envValue.trim().length === 0) {
1601
+ throw new Error(`Environment variable '${trimmed}' for ${description}[${i}] is empty`);
1602
+ }
1603
+ resolved.push(envValue);
1604
+ } else {
1605
+ resolved.push(trimmed);
1606
+ }
1607
+ }
1608
+ return resolved.length > 0 ? resolved : void 0;
1609
+ }
1136
1610
 
1137
1611
  // src/evaluation/providers/vscode.ts
1138
1612
  var import_promises3 = require("fs/promises");
1139
- var import_node_path3 = __toESM(require("path"), 1);
1613
+ var import_node_path4 = __toESM(require("path"), 1);
1140
1614
  var import_subagent = require("subagent");
1141
1615
  var VSCodeProvider = class {
1142
1616
  id;
@@ -1154,12 +1628,11 @@ var VSCodeProvider = class {
1154
1628
  if (request.signal?.aborted) {
1155
1629
  throw new Error("VS Code provider request was aborted before dispatch");
1156
1630
  }
1157
- const attachments = normalizeAttachments(request.attachments);
1158
- const promptContent = buildPromptDocument(request, attachments, request.guideline_patterns);
1631
+ const inputFiles = normalizeAttachments(request.inputFiles);
1632
+ const promptContent = buildPromptDocument(request, inputFiles, request.guideline_patterns);
1159
1633
  const session = await (0, import_subagent.dispatchAgentSession)({
1160
1634
  userQuery: promptContent,
1161
- // Use full prompt content instead of just request.prompt
1162
- extraAttachments: attachments,
1635
+ extraAttachments: inputFiles,
1163
1636
  wait: this.config.waitForResponse,
1164
1637
  dryRun: this.config.dryRun,
1165
1638
  vscodeCmd: this.config.command,
@@ -1176,7 +1649,7 @@ var VSCodeProvider = class {
1176
1649
  text: "",
1177
1650
  raw: {
1178
1651
  session,
1179
- attachments
1652
+ inputFiles
1180
1653
  }
1181
1654
  };
1182
1655
  }
@@ -1185,7 +1658,7 @@ var VSCodeProvider = class {
1185
1658
  text: responseText,
1186
1659
  raw: {
1187
1660
  session,
1188
- attachments
1661
+ inputFiles
1189
1662
  }
1190
1663
  };
1191
1664
  }
@@ -1195,17 +1668,17 @@ var VSCodeProvider = class {
1195
1668
  }
1196
1669
  const normalizedRequests = requests.map((req) => ({
1197
1670
  request: req,
1198
- attachments: normalizeAttachments(req.attachments)
1671
+ inputFiles: normalizeAttachments(req.inputFiles)
1199
1672
  }));
1200
- const combinedAttachments = mergeAttachments(
1201
- normalizedRequests.map(({ attachments }) => attachments)
1673
+ const combinedInputFiles = mergeAttachments(
1674
+ normalizedRequests.map(({ inputFiles }) => inputFiles)
1202
1675
  );
1203
1676
  const userQueries = normalizedRequests.map(
1204
- ({ request, attachments }) => buildPromptDocument(request, attachments, request.guideline_patterns)
1677
+ ({ request, inputFiles }) => buildPromptDocument(request, inputFiles, request.guideline_patterns)
1205
1678
  );
1206
1679
  const session = await (0, import_subagent.dispatchBatchAgent)({
1207
1680
  userQueries,
1208
- extraAttachments: combinedAttachments,
1681
+ extraAttachments: combinedInputFiles,
1209
1682
  wait: this.config.waitForResponse,
1210
1683
  dryRun: this.config.dryRun,
1211
1684
  vscodeCmd: this.config.command,
@@ -1218,12 +1691,12 @@ var VSCodeProvider = class {
1218
1691
  throw new Error(failure);
1219
1692
  }
1220
1693
  if (this.config.dryRun) {
1221
- return normalizedRequests.map(({ attachments }) => ({
1694
+ return normalizedRequests.map(({ inputFiles }) => ({
1222
1695
  text: "",
1223
1696
  raw: {
1224
1697
  session,
1225
- attachments,
1226
- allAttachments: combinedAttachments
1698
+ inputFiles,
1699
+ allInputFiles: combinedInputFiles
1227
1700
  }
1228
1701
  }));
1229
1702
  }
@@ -1239,8 +1712,8 @@ var VSCodeProvider = class {
1239
1712
  text: responseText,
1240
1713
  raw: {
1241
1714
  session,
1242
- attachments: normalizedRequests[index]?.attachments,
1243
- allAttachments: combinedAttachments,
1715
+ inputFiles: normalizedRequests[index]?.inputFiles,
1716
+ allInputFiles: combinedInputFiles,
1244
1717
  responseFile
1245
1718
  }
1246
1719
  });
@@ -1267,7 +1740,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, attachmentFiles) {
1267
1740
  return "";
1268
1741
  }
1269
1742
  const buildList = (files) => files.map((absolutePath) => {
1270
- const fileName = import_node_path3.default.basename(absolutePath);
1743
+ const fileName = import_node_path4.default.basename(absolutePath);
1271
1744
  const fileUri = pathToFileUri(absolutePath);
1272
1745
  return `* [${fileName}](${fileUri})`;
1273
1746
  });
@@ -1292,8 +1765,8 @@ function collectGuidelineFiles(attachments, guidelinePatterns) {
1292
1765
  }
1293
1766
  const unique = /* @__PURE__ */ new Map();
1294
1767
  for (const attachment of attachments) {
1295
- const absolutePath = import_node_path3.default.resolve(attachment);
1296
- const normalized = absolutePath.split(import_node_path3.default.sep).join("/");
1768
+ const absolutePath = import_node_path4.default.resolve(attachment);
1769
+ const normalized = absolutePath.split(import_node_path4.default.sep).join("/");
1297
1770
  if (isGuidelineFile(normalized, guidelinePatterns)) {
1298
1771
  if (!unique.has(absolutePath)) {
1299
1772
  unique.set(absolutePath, absolutePath);
@@ -1308,84 +1781,660 @@ function collectAttachmentFiles(attachments) {
1308
1781
  }
1309
1782
  const unique = /* @__PURE__ */ new Map();
1310
1783
  for (const attachment of attachments) {
1311
- const absolutePath = import_node_path3.default.resolve(attachment);
1784
+ const absolutePath = import_node_path4.default.resolve(attachment);
1312
1785
  if (!unique.has(absolutePath)) {
1313
1786
  unique.set(absolutePath, absolutePath);
1314
1787
  }
1315
1788
  }
1316
- return Array.from(unique.values());
1789
+ return Array.from(unique.values());
1790
+ }
1791
/**
 * Build a `file://` URI string for a filesystem path.
 *
 * Relative paths are resolved to absolute ones first and every backslash is
 * turned into a forward slash. A path that starts with a drive letter
 * (`C:/...`) gets an extra slash so the result reads `file:///C:/...`.
 * No percent-encoding is applied.
 *
 * @param {string} filePath - Absolute or relative filesystem path.
 * @returns {string} The `file://` URI for the path.
 */
function pathToFileUri(filePath) {
  const nodePath = import_node_path4.default;
  const absolute = nodePath.isAbsolute(filePath) ? filePath : nodePath.resolve(filePath);
  const slashed = absolute.split("\\").join("/");
  const startsWithDrive = /^[a-zA-Z]:\//.test(slashed);
  const scheme = startsWithDrive ? "file:///" : "file://";
  return `${scheme}${slashed}`;
}
1799
/**
 * Resolve attachment paths to absolute form and drop duplicates.
 *
 * @param {string[] | undefined} attachments - Paths to normalize.
 * @returns {string[] | undefined} Unique absolute paths in first-seen order,
 *   or `undefined` when no attachments were supplied.
 */
function normalizeAttachments(attachments) {
  const nothingToDo = !attachments || attachments.length === 0;
  if (nothingToDo) {
    return void 0;
  }
  const resolved = [...attachments].map((entry) => import_node_path4.default.resolve(entry));
  return [...new Set(resolved)];
}
1809
/**
 * Merge several (possibly missing) path lists into one de-duplicated list of
 * absolute paths.
 *
 * @param {Iterable<string[] | undefined>} all - Lists to merge; falsy entries
 *   are skipped.
 * @returns {string[] | undefined} Unique absolute paths in first-seen order,
 *   or `undefined` when nothing was collected.
 */
function mergeAttachments(all) {
  const seen = /* @__PURE__ */ new Set();
  for (const group of all) {
    for (const entry of group || []) {
      seen.add(import_node_path4.default.resolve(entry));
    }
  }
  if (seen.size === 0) {
    return void 0;
  }
  return [...seen];
}
1819
/**
 * Provision VS Code subagents through the `subagent` package.
 *
 * Errors raised while provisioning are caught and reported in the returned
 * object instead of being thrown.
 *
 * @param {{ kind: string, count: number, verbose?: boolean }} options
 *   `kind` selects "code-insiders" when it equals "vscode-insiders" and
 *   "code" otherwise; `count` is the number of subagents requested;
 *   `verbose` enables console progress output.
 * @returns {Promise<{ provisioned: boolean, message: string }>} Outcome summary.
 */
async function ensureVSCodeSubagents(options) {
  const { kind, count, verbose = false } = options;
  let vscodeCmd = "code";
  if (kind === "vscode-insiders") {
    vscodeCmd = "code-insiders";
  }
  const targetRoot = (0, import_subagent.getSubagentRoot)(vscodeCmd);
  try {
    if (verbose) {
      console.log(`Provisioning ${count} subagent(s) via: subagent ${vscodeCmd} provision`);
    }
    const result = await (0, import_subagent.provisionSubagents)({
      targetRoot,
      subagents: count,
      dryRun: false
    });
    if (verbose) {
      const createdCount = result.created.length;
      if (createdCount > 0) {
        console.log(`Created ${createdCount} new subagent(s)`);
      }
      const reusedCount = result.skippedExisting.length;
      if (reusedCount > 0) {
        console.log(`Reusing ${reusedCount} existing unlocked subagent(s)`);
      }
      console.log(`\ntotal unlocked subagents available: ${result.created.length + result.skippedExisting.length}`);
    }
    const message = `Provisioned ${count} subagent(s): ${result.created.length} created, ${result.skippedExisting.length} reused`;
    return { provisioned: true, message };
  } catch (error) {
    // Provisioning is best-effort: report the failure and let the caller continue.
    const errorMessage = error instanceof Error ? error.message : String(error);
    if (verbose) {
      console.warn(`Provisioning failed (continuing anyway): ${errorMessage}`);
    }
    return { provisioned: false, message: `Provisioning failed: ${errorMessage}` };
  }
}
1857
+
1858
+ // src/evaluation/providers/codex.ts
1859
+ var import_node_child_process2 = require("child_process");
1860
+ var import_node_fs3 = require("fs");
1861
+ var import_promises4 = require("fs/promises");
1862
+ var import_node_os = require("os");
1863
+ var import_node_path6 = __toESM(require("path"), 1);
1864
+ var import_node_util2 = require("util");
1865
+
1866
+ // src/evaluation/providers/preread.ts
1867
+ var import_node_path5 = __toESM(require("path"), 1);
1868
/**
 * Assemble the prompt text sent to a CLI agent: an optional mandatory
 * pre-read block listing guideline and input files, followed by the user
 * query section.
 *
 * @param {{ prompt: string, guideline_patterns?: unknown }} request - Provider request.
 * @param {string[] | undefined} inputFiles - Files the agent should read first.
 * @param {{ guidelinePatterns?: unknown, guidelineOverrides?: Set<string> }} [options]
 *   Optional pattern override and a set of paths always treated as guidelines.
 * @returns {string} The trimmed prompt document.
 */
function buildPromptDocument2(request, inputFiles, options) {
  const patterns = options?.guidelinePatterns ?? request.guideline_patterns;
  const guidelineFiles = collectGuidelineFiles2(inputFiles, patterns, options?.guidelineOverrides);
  const guidelineSet = new Set(guidelineFiles);
  // Guideline files are listed in their own section, so keep them out of the input list.
  const otherInputFiles = collectInputFiles(inputFiles).filter((file) => !guidelineSet.has(file));
  const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, otherInputFiles);
  const parts = prereadBlock.length > 0 ? ["\n", prereadBlock] : [];
  parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
  return parts.join("\n").trim();
}
1886
/**
 * Resolve input file paths to absolute form and remove duplicates.
 *
 * @param {string[] | undefined} inputFiles - Paths to normalize.
 * @returns {string[] | undefined} Unique absolute paths in first-seen order,
 *   or `undefined` when the list is missing or empty.
 */
function normalizeInputFiles2(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return void 0;
  }
  const seen = /* @__PURE__ */ new Set();
  for (const entry of inputFiles) {
    seen.add(import_node_path5.default.resolve(entry));
  }
  return [...seen];
}
1899
/**
 * Pick the input files that count as guideline files.
 *
 * A file qualifies when its absolute path is in `overrides`, or when
 * `isGuidelineFile` accepts its forward-slash form for the given patterns.
 *
 * @param {string[] | undefined} inputFiles - Candidate paths.
 * @param {unknown} guidelinePatterns - Patterns forwarded to `isGuidelineFile`.
 * @param {Set<string>} [overrides] - Absolute paths always treated as guidelines.
 * @returns {string[]} Unique absolute guideline paths in first-seen order.
 */
function collectGuidelineFiles2(inputFiles, guidelinePatterns, overrides) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  const nodePath = import_node_path5.default;
  const selected = /* @__PURE__ */ new Set();
  for (const candidate of inputFiles) {
    const resolved = nodePath.resolve(candidate);
    // Overrides short-circuit the pattern check entirely.
    const isGuideline = Boolean(overrides?.has(resolved)) || isGuidelineFile(resolved.split(nodePath.sep).join("/"), guidelinePatterns);
    if (isGuideline) {
      selected.add(resolved);
    }
  }
  return [...selected];
}
1921
/**
 * Resolve input file paths to absolute form and remove duplicates.
 *
 * @param {string[] | undefined} inputFiles - Paths to collect.
 * @returns {string[]} Unique absolute paths in first-seen order; an empty
 *   array when the list is missing or empty.
 */
function collectInputFiles(inputFiles) {
  const isEmpty = !inputFiles || inputFiles.length === 0;
  if (isEmpty) {
    return [];
  }
  const resolved = [...inputFiles].map((entry) => import_node_path5.default.resolve(entry));
  return Array.from(new Set(resolved));
}
1934
/**
 * Render the "read these files first" instruction block of a prompt.
 *
 * Each file becomes a markdown bullet linking its base name to its
 * `file://` URI. Guideline files and input files get separate sections,
 * followed by two fixed instruction lines.
 *
 * @param {string[]} guidelineFiles - Absolute paths of guideline files.
 * @param {string[]} inputFiles - Absolute paths of the remaining input files.
 * @returns {string} The block text, or "" when both lists are empty.
 */
function buildMandatoryPrereadBlock2(guidelineFiles, inputFiles) {
  if (guidelineFiles.length === 0 && inputFiles.length === 0) {
    return "";
  }
  const toBullet = (absolutePath) => {
    const fileName = import_node_path5.default.basename(absolutePath);
    const fileUri = pathToFileUri2(absolutePath);
    return `* [${fileName}](${fileUri})`;
  };
  const renderSection = (heading, files) => `${heading}\n${files.map(toBullet).join("\n")}.`;
  const sections = [];
  if (guidelineFiles.length > 0) {
    sections.push(renderSection("Read all guideline files:", guidelineFiles));
  }
  if (inputFiles.length > 0) {
    sections.push(renderSection("Read all input files:", inputFiles));
  }
  sections.push("If any file is missing, fail with ERROR: missing-file <filename> and stop.");
  sections.push("Then apply system_instructions on the user query below.");
  return sections.join("\n");
}
1958
/**
 * Turn a filesystem path into a `file://` URI string.
 *
 * The path is made absolute when necessary and backslashes become forward
 * slashes. Drive-letter paths (`C:/...`) are emitted as `file:///C:/...`;
 * everything else is emitted as `file://` followed by the path. Characters
 * are not percent-encoded.
 *
 * @param {string} filePath - Absolute or relative filesystem path.
 * @returns {string} The resulting URI.
 */
function pathToFileUri2(filePath) {
  let absolute = filePath;
  if (!import_node_path5.default.isAbsolute(absolute)) {
    absolute = import_node_path5.default.resolve(absolute);
  }
  const unixStyle = absolute.replace(/\\/g, "/");
  const prefix = /^[a-zA-Z]:\//.test(unixStyle) ? "file:///" : "file://";
  return prefix + unixStyle;
}
1966
+
1967
+ // src/evaluation/providers/codex.ts
1968
var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
// Prefix of the temporary workspace directory created for each invocation.
var WORKSPACE_PREFIX = "agentv-codex-";
// Name of the prompt file written into the workspace.
var PROMPT_FILENAME = "prompt.md";
// Workspace sub-directory that receives copies of the input files.
var FILES_DIR = "files";
// JSONL event type that carries a completed item in Codex output.
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
/**
 * Provider that answers requests by running the Codex CLI in a throwaway
 * workspace. Input files are copied into the workspace, the prompt is piped
 * to the CLI on stdin, and the assistant text is extracted from the CLI's
 * JSON output.
 */
var CodexProvider = class {
  id;
  kind = "codex";
  targetName;
  supportsBatch = false;
  config;
  runCodex;
  environmentCheck;
  resolvedExecutable;
  /**
   * @param {string} targetName - Name of the target this provider serves.
   * @param {object} config - Target settings (`executable`, `args`, `cwd`, `timeoutMs`).
   * @param {Function} [runner] - Process runner; defaults to `defaultCodexRunner`.
   */
  constructor(targetName, config, runner = defaultCodexRunner) {
    this.id = `codex:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runCodex = runner;
  }
  /**
   * Run one request through the Codex CLI.
   *
   * @param {object} request - Provider request (`prompt`, `inputFiles`,
   *   `guideline_patterns`, optional `signal`).
   * @returns {Promise<{ text: string, raw: object }>} Assistant text plus raw run details.
   * @throws {Error} When the request is already aborted, the CLI times out,
   *   exits non-zero, or emits output without an assistant message.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Codex provider request was aborted before execution");
    }
    await this.ensureEnvironmentReady();
    const inputFiles = normalizeInputFiles2(request.inputFiles);
    // Remember which ORIGINAL paths are guidelines; the mirrored copies live
    // elsewhere and may no longer match the guideline patterns.
    const originalGuidelines = new Set(
      collectGuidelineFiles2(inputFiles, request.guideline_patterns).map((file) => import_node_path6.default.resolve(file))
    );
    const workspaceRoot = await this.createWorkspace();
    try {
      const { mirroredInputFiles, guidelineMirrors } = await this.mirrorInputFiles(
        inputFiles,
        workspaceRoot,
        originalGuidelines
      );
      const promptContent = buildPromptDocument2(request, mirroredInputFiles, {
        guidelinePatterns: request.guideline_patterns,
        guidelineOverrides: guidelineMirrors
      });
      const promptFile = import_node_path6.default.join(workspaceRoot, PROMPT_FILENAME);
      await (0, import_promises4.writeFile)(promptFile, promptContent, "utf8");
      const args = this.buildCodexArgs();
      const cwd = this.resolveCwd(workspaceRoot);
      const result = await this.executeCodex(args, cwd, promptContent, request.signal);
      if (result.timedOut) {
        throw new Error(
          `Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
        );
      }
      if (result.exitCode !== 0) {
        const detail = pickDetail(result.stderr, result.stdout);
        const prefix = `Codex CLI exited with code ${result.exitCode}`;
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parseCodexJson(result.stdout);
      const assistantText = extractAssistantText(parsed);
      return {
        text: assistantText,
        raw: {
          response: parsed,
          stdout: result.stdout,
          stderr: result.stderr,
          exitCode: result.exitCode,
          args,
          executable: this.resolvedExecutable ?? this.config.executable,
          promptFile,
          workspace: workspaceRoot,
          inputFiles: mirroredInputFiles
        }
      };
    } finally {
      await this.cleanupWorkspace(workspaceRoot);
    }
  }
  /** Run the environment validation once and reuse its promise afterwards. */
  async ensureEnvironmentReady() {
    if (!this.environmentCheck) {
      this.environmentCheck = this.validateEnvironment();
    }
    await this.environmentCheck;
  }
  /** Resolve the configured executable to a concrete path. */
  async validateEnvironment() {
    this.resolvedExecutable = await locateExecutable(this.config.executable);
  }
  /**
   * @param {string} workspaceRoot - Temporary workspace directory.
   * @returns {string} The configured `cwd` (resolved), or the workspace when none is set.
   */
  resolveCwd(workspaceRoot) {
    if (!this.config.cwd) {
      return workspaceRoot;
    }
    return import_node_path6.default.resolve(this.config.cwd);
  }
  /**
   * @returns {string[]} CLI arguments: fixed flags, then configured extra
   *   args, then "-" as the final argument (the prompt is piped on stdin).
   */
  buildCodexArgs() {
    const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
    if (this.config.args && this.config.args.length > 0) {
      args.push(...this.config.args);
    }
    args.push("-");
    return args;
  }
  /**
   * Invoke the runner, translating a missing executable (ENOENT) into a
   * descriptive error. Other errors are rethrown unchanged.
   */
  async executeCodex(args, cwd, promptContent, signal) {
    try {
      return await this.runCodex({
        executable: this.resolvedExecutable ?? this.config.executable,
        args,
        cwd,
        prompt: promptContent,
        timeoutMs: this.config.timeoutMs,
        env: process.env,
        signal
      });
    } catch (error) {
      const err = error;
      if (err.code === "ENOENT") {
        throw new Error(
          `Codex executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
        );
      }
      throw error;
    }
  }
  /**
   * Copy the input files into `<workspaceRoot>/files`.
   *
   * Files sharing a base name get a numeric suffix (`name.1`, `name.2`, ...).
   * Every destination name is tracked so a suffixed name can never collide
   * with another input whose real base name is identical (for example
   * `notes.md`, `notes.md`, `notes.md.1`), which would otherwise overwrite
   * an already mirrored file.
   *
   * @param {string[] | undefined} inputFiles - Absolute source paths.
   * @param {string} workspaceRoot - Temporary workspace directory.
   * @param {Set<string>} guidelineOriginals - Source paths that are guidelines.
   * @returns {Promise<{ mirroredInputFiles: string[] | undefined, guidelineMirrors: Set<string> }>}
   *   Destination paths in input order, and the subset that mirrors guidelines.
   */
  async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
    if (!inputFiles || inputFiles.length === 0) {
      return {
        mirroredInputFiles: void 0,
        guidelineMirrors: /* @__PURE__ */ new Set()
      };
    }
    const filesRoot = import_node_path6.default.join(workspaceRoot, FILES_DIR);
    await (0, import_promises4.mkdir)(filesRoot, { recursive: true });
    const mirrored = [];
    const guidelineMirrors = /* @__PURE__ */ new Set();
    const nameCounts = /* @__PURE__ */ new Map();
    const usedNames = /* @__PURE__ */ new Set();
    for (const inputFile of inputFiles) {
      const absoluteSource = import_node_path6.default.resolve(inputFile);
      const baseName = import_node_path6.default.basename(absoluteSource);
      let suffix = nameCounts.get(baseName) ?? 0;
      let finalName = suffix === 0 ? baseName : `${baseName}.${suffix}`;
      // Skip over names already taken by an earlier input.
      while (usedNames.has(finalName)) {
        suffix += 1;
        finalName = `${baseName}.${suffix}`;
      }
      nameCounts.set(baseName, suffix + 1);
      usedNames.add(finalName);
      const destination = import_node_path6.default.join(filesRoot, finalName);
      await (0, import_promises4.copyFile)(absoluteSource, destination);
      const resolvedDestination = import_node_path6.default.resolve(destination);
      mirrored.push(resolvedDestination);
      if (guidelineOriginals.has(absoluteSource)) {
        guidelineMirrors.add(resolvedDestination);
      }
    }
    return {
      mirroredInputFiles: mirrored,
      guidelineMirrors
    };
  }
  /** Create a fresh temporary workspace directory and return its path. */
  async createWorkspace() {
    return await (0, import_promises4.mkdtemp)(import_node_path6.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
  }
  /** Remove the workspace; cleanup is best-effort, so failures are ignored. */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await (0, import_promises4.rm)(workspaceRoot, { recursive: true, force: true });
    } catch {
    }
  }
};
2128
/**
 * Resolve the configured Codex executable to a concrete path.
 *
 * A candidate containing a path separator is treated as a path: it is made
 * absolute, passed through `ensureWindowsExecutableVariant`, and checked
 * for existence. A bare command name is looked up with `where` (Windows)
 * or `which` (elsewhere).
 *
 * The lookup runs through `execFile` with an argument vector rather than a
 * shell command line, so characters in the configured name (`;`, `&`, `$()`,
 * spaces, ...) are never interpreted by a shell.
 *
 * @param {string} candidate - Executable path or bare command name.
 * @returns {Promise<string>} Path of the located executable.
 * @throws {Error} When a bare command cannot be found on PATH; a path
 *   candidate that does not exist rejects with the `access` error.
 */
async function locateExecutable(candidate) {
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
  if (includesPathSeparator) {
    const resolved = import_node_path6.default.isAbsolute(candidate) ? candidate : import_node_path6.default.resolve(candidate);
    const executablePath = await ensureWindowsExecutableVariant(resolved);
    await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
    return executablePath;
  }
  const locator = process.platform === "win32" ? "where" : "which";
  try {
    const execFileAsync = (0, import_node_util2.promisify)(import_node_child_process2.execFile);
    const { stdout } = await execFileAsync(locator, [candidate]);
    const lines = stdout.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
    const preferred = selectExecutableCandidate(lines);
    if (preferred) {
      const executablePath = await ensureWindowsExecutableVariant(preferred);
      await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
      return executablePath;
    }
  } catch {
    // Any lookup failure is reported as "not found" below.
  }
  throw new Error(`Codex executable '${candidate}' was not found on PATH`);
}
2150
/**
 * Choose which lookup result to use as the executable.
 *
 * Outside Windows the first candidate wins. On Windows the candidates are
 * matched against the executable extensions in order, and the first
 * candidate carrying the earliest matching extension is returned; if none
 * matches, the first candidate is used.
 *
 * @param {string[]} candidates - Paths reported by the lookup command.
 * @returns {string | undefined} The chosen path, or `undefined` for an empty list.
 */
function selectExecutableCandidate(candidates) {
  if (candidates.length === 0) {
    return void 0;
  }
  const fallback = candidates[0];
  if (process.platform !== "win32") {
    return fallback;
  }
  const preferred = getWindowsExecutableExtensions()
    .map((ext) => candidates.find((entry) => entry.toLowerCase().endsWith(ext)))
    .find(Boolean);
  return preferred ?? fallback;
}
2166
/**
 * On Windows, find an existing variant of `candidate` with an executable
 * extension appended.
 *
 * The candidate is returned untouched on other platforms, when it already
 * ends with an executable extension, or when no variant exists on disk.
 *
 * @param {string} candidate - Path of the executable without guaranteed extension.
 * @returns {Promise<string>} The candidate or its first existing variant.
 */
async function ensureWindowsExecutableVariant(candidate) {
  const needsProbe = process.platform === "win32" && !hasExecutableExtension(candidate);
  if (!needsProbe) {
    return candidate;
  }
  for (const suffix of getWindowsExecutableExtensions()) {
    const probe = `${candidate}${suffix}`;
    const exists = await (0, import_promises4.access)(probe, import_node_fs3.constants.F_OK).then(
      () => true,
      () => false
    );
    if (exists) {
      return probe;
    }
  }
  return candidate;
}
2184
/**
 * Check whether a path ends with one of the Windows executable extensions
 * (case-insensitive on the path side).
 *
 * @param {string} candidate - Path to inspect.
 * @returns {boolean} `true` when an executable extension is present.
 */
function hasExecutableExtension(candidate) {
  const lowered = candidate.toLowerCase();
  for (const ext of getWindowsExecutableExtensions()) {
    if (lowered.endsWith(ext)) {
      return true;
    }
  }
  return false;
}
2188
// Extensions used on Windows when PATHEXT yields no entries.
var DEFAULT_WINDOWS_EXTENSIONS = [".com", ".exe", ".bat", ".cmd", ".ps1"];
/**
 * List the file extensions treated as executable on Windows.
 *
 * Entries come from the `PATHEXT` environment variable (split on ";",
 * trimmed, lower-cased, blanks dropped); the defaults are used when that
 * produces nothing.
 *
 * @returns {string[]} Lower-case extensions, or an empty array off Windows.
 */
function getWindowsExecutableExtensions() {
  if (process.platform !== "win32") {
    return [];
  }
  const configured = (process.env.PATHEXT ?? "")
    .split(";")
    .map((ext) => ext.trim().toLowerCase())
    .filter((ext) => ext.length > 0);
  return configured.length > 0 ? configured : DEFAULT_WINDOWS_EXTENSIONS;
}
2196
/**
 * Parse the stdout of the Codex CLI run in `--json` mode.
 *
 * Attempts, in order: the whole output as one JSON document, the output as
 * JSON Lines (via `parseJsonLines`), and finally the text from the last "{"
 * to the end.
 *
 * @param {string} output - Raw stdout of the CLI.
 * @returns {unknown} The parsed value (an array when parsed as JSON Lines).
 * @throws {Error} When the output is blank or none of the attempts parses.
 */
function parseCodexJson(output) {
  const text = output.trim();
  if (text.length === 0) {
    throw new Error("Codex CLI produced no output in --json mode");
  }
  const attempt = (source) => {
    try {
      return { ok: true, value: JSON.parse(source) };
    } catch {
      return { ok: false, value: void 0 };
    }
  };
  const whole = attempt(text);
  if (whole.ok) {
    return whole.value;
  }
  const events = parseJsonLines(text);
  if (events) {
    return events;
  }
  const braceAt = text.lastIndexOf("{");
  if (braceAt >= 0) {
    const tail = attempt(text.slice(braceAt));
    if (tail.ok) {
      return tail.value;
    }
  }
  const preview = text.slice(0, 200);
  const ellipsis = text.length > 200 ? "\u2026" : "";
  throw new Error(`Codex CLI emitted invalid JSON: ${preview}${ellipsis}`);
}
2220
/**
 * Pull the assistant's text out of parsed Codex CLI output.
 *
 * Sources are tried in this order: the event stream (when `parsed` is an
 * array), the value treated as a single event, the last assistant entry of
 * `messages`, `response.content`, and finally `output`.
 *
 * @param {unknown} parsed - Value returned by `parseCodexJson`.
 * @returns {string} The first non-empty text found.
 * @throws {Error} When no source yields text.
 */
function extractAssistantText(parsed) {
  const missingMessage = "Codex CLI JSON response did not include an assistant message";
  if (Array.isArray(parsed)) {
    const streamed = extractFromEventStream(parsed);
    if (streamed) {
      return streamed;
    }
  }
  if (!parsed || typeof parsed !== "object") {
    throw new Error(missingMessage);
  }
  const direct = extractFromEvent(parsed);
  if (direct) {
    return direct;
  }
  if (Array.isArray(parsed.messages)) {
    // Walk newest-first so the most recent assistant message wins.
    for (const entry of [...parsed.messages].reverse()) {
      const isAssistant = Boolean(entry) && typeof entry === "object" && entry.role === "assistant";
      if (!isAssistant) {
        continue;
      }
      const flattened = flattenContent(entry.content);
      if (flattened) {
        return flattened;
      }
    }
  }
  const { response } = parsed;
  if (response && typeof response === "object") {
    const fromResponse = flattenContent(response.content);
    if (fromResponse) {
      return fromResponse;
    }
  }
  const fromOutput = flattenContent(parsed.output);
  if (fromOutput) {
    return fromOutput;
  }
  throw new Error(missingMessage);
}
2268
/**
 * Scan a list of events from last to first and return the first text that
 * `extractFromEvent` produces.
 *
 * @param {unknown[]} events - Parsed events.
 * @returns {string | undefined} Text of the latest event carrying any.
 */
function extractFromEventStream(events) {
  let position = events.length - 1;
  while (position >= 0) {
    const text = extractFromEvent(events[position]);
    if (text) {
      return text;
    }
    position -= 1;
  }
  return void 0;
}
2278
/**
 * Extract text from a single event object.
 *
 * For an "item.completed" event the nested `item` is consulted first; after
 * that the event's `output` (or, when that is nullish, `content`) is flattened.
 *
 * @param {unknown} event - One parsed event.
 * @returns {string | undefined} Non-empty text, or `undefined`.
 */
function extractFromEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  if (event.type === JSONL_TYPE_ITEM_COMPLETED) {
    const fromItem = extractFromItem(event.item);
    if (fromItem) {
      return fromItem;
    }
  }
  // Empty strings count as "no text".
  return flattenContent(event.output ?? event.content) || void 0;
}
2298
/**
 * Extract text from the `item` of a completed-item event.
 *
 * Only items whose `type` is "agent_message", "response" or "output" are
 * considered; their `text`, `content` or `output` (first non-nullish) is
 * flattened.
 *
 * @param {unknown} item - The event's item.
 * @returns {string | undefined} Non-empty text, or `undefined`.
 */
function extractFromItem(item) {
  if (!item || typeof item !== "object") {
    return void 0;
  }
  const textBearingTypes = ["agent_message", "response", "output"];
  if (!textBearingTypes.includes(item.type)) {
    return void 0;
  }
  // Empty strings count as "no text".
  return flattenContent(item.text ?? item.content ?? item.output) || void 0;
}
1318
- function pathToFileUri(filePath) {
1319
- const absolutePath = import_node_path3.default.isAbsolute(filePath) ? filePath : import_node_path3.default.resolve(filePath);
1320
- const normalizedPath = absolutePath.replace(/\\/g, "/");
1321
- if (/^[a-zA-Z]:\//.test(normalizedPath)) {
1322
- return `file:///${normalizedPath}`;
2312
/**
 * Reduce a content value to plain text.
 *
 * - A string is returned unchanged.
 * - An array contributes its string elements and the string `text` of its
 *   object elements; non-empty pieces are joined with " \n".
 * - An object with a string `text` property yields that text.
 *
 * @param {unknown} value - Content in any of the shapes above.
 * @returns {string | undefined} The text, or `undefined` when none is found.
 */
function flattenContent(value) {
  const textOf = (node) => {
    if (typeof node === "string") {
      return node;
    }
    const hasText = Boolean(node) && typeof node === "object" && "text" in node;
    return hasText && typeof node.text === "string" ? node.text : void 0;
  };
  if (!Array.isArray(value)) {
    return textOf(value);
  }
  const pieces = [];
  for (const segment of value) {
    const piece = textOf(segment);
    if (typeof piece === "string" && piece.length > 0) {
      pieces.push(piece);
    }
  }
  return pieces.length > 0 ? pieces.join(" \n") : void 0;
}
1326
- function normalizeAttachments(attachments) {
1327
- if (!attachments || attachments.length === 0) {
2335
/**
 * Parse text as JSON Lines (one JSON document per non-blank line).
 *
 * @param {string} output - Text to parse.
 * @returns {unknown[] | undefined} Parsed values in line order, or
 *   `undefined` when there are fewer than two non-blank lines or any line
 *   is not valid JSON.
 */
function parseJsonLines(output) {
  const lines = output
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
  if (lines.length <= 1) {
    return void 0;
  }
  try {
    return lines.map((line) => JSON.parse(line));
  } catch {
    // A single malformed line disqualifies the whole input.
    return void 0;
  }
}
1336
- function mergeAttachments(all) {
1337
- const deduped = /* @__PURE__ */ new Set();
1338
- for (const list of all) {
1339
- if (!list) continue;
1340
- for (const attachment of list) {
1341
- deduped.add(import_node_path3.default.resolve(attachment));
1342
- }
2350
/**
 * Choose the text used to describe a failed CLI run: trimmed stderr when it
 * has content, otherwise trimmed stdout.
 *
 * @param {string} stderr - Captured standard error.
 * @param {string} stdout - Captured standard output.
 * @returns {string | undefined} The detail text, or `undefined` when both are blank.
 */
function pickDetail(stderr, stdout) {
  for (const stream of [stderr, stdout]) {
    const text = stream.trim();
    if (text.length > 0) {
      return text;
    }
  }
  return void 0;
}
1346
- async function ensureVSCodeSubagents(options) {
1347
- const { kind, count, verbose = false } = options;
1348
- const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
1349
- const subagentRoot = (0, import_subagent.getSubagentRoot)(vscodeCmd);
1350
- try {
1351
- if (verbose) {
1352
- console.log(`Provisioning ${count} subagent(s) via: subagent ${vscodeCmd} provision`);
2358
/**
 * Format the " after Ns" suffix appended to timeout error messages.
 *
 * @param {number | undefined} timeoutMs - Timeout in milliseconds.
 * @returns {string} " after <seconds>s" with seconds rounded up, or "" when
 *   the timeout is missing, zero or negative.
 */
function formatTimeoutSuffix2(timeoutMs) {
  const hasTimeout = Boolean(timeoutMs) && !(timeoutMs <= 0);
  return hasTimeout ? ` after ${Math.ceil(timeoutMs / 1e3)}s` : "";
}
2365
/**
 * Spawn the Codex CLI, pipe the prompt to its stdin and collect its output.
 *
 * The child is sent SIGTERM when `options.signal` aborts or when
 * `options.timeoutMs` elapses (the latter sets `timedOut` in the result).
 *
 * @param {object} options
 * @param {string} options.executable - Program to run.
 * @param {string[]} options.args - Command-line arguments.
 * @param {string} [options.cwd] - Working directory for the child.
 * @param {object} [options.env] - Environment for the child.
 * @param {string} options.prompt - Text written to the child's stdin.
 * @param {number} [options.timeoutMs] - Kill the child after this many milliseconds.
 * @param {AbortSignal} [options.signal] - Aborting kills the child.
 * @returns {Promise<{ stdout: string, stderr: string, exitCode: number, timedOut: boolean }>}
 *   Resolves when the child closes; `exitCode` is -1 when no numeric code is
 *   reported. Rejects when the child process emits an 'error' event.
 */
async function defaultCodexRunner(options) {
  return await new Promise((resolve, reject) => {
    const child = (0, import_node_child_process2.spawn)(options.executable, options.args, {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: shouldShellExecute(options.executable)
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    const onAbort = () => {
      child.kill("SIGTERM");
    };
    if (options.signal) {
      if (options.signal.aborted) {
        onAbort();
      } else {
        options.signal.addEventListener("abort", onAbort, { once: true });
      }
    }
    let timeoutHandle;
    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill("SIGTERM");
      }, options.timeoutMs);
      // Do not let the pending timer keep the process alive on its own.
      timeoutHandle.unref?.();
    }
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    // If the child exits (or is killed) before it has read the whole prompt,
    // the write fails with EPIPE. Without a listener that becomes an
    // unhandled 'error' event and crashes the host process; the outcome of
    // the run is reported through the 'close' / 'error' handlers below.
    child.stdin.on("error", () => {
    });
    child.stdin.end(options.prompt);
    const cleanup = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", onAbort);
      }
    };
    child.on("error", (error) => {
      cleanup();
      reject(error);
    });
    child.on("close", (code) => {
      cleanup();
      resolve({
        stdout,
        stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
/**
 * Decide whether the executable must be launched through a shell.
 *
 * @param {string} executable - Program path or name.
 * @returns {boolean} `true` only on Windows for `.cmd`, `.bat` and `.ps1` files.
 */
function shouldShellExecute(executable) {
  if (process.platform !== "win32") {
    return false;
  }
  const lower = executable.toLowerCase();
  return lower.endsWith(".cmd") || lower.endsWith(".bat") || lower.endsWith(".ps1");
}
1384
2433
 
1385
2434
  // src/evaluation/providers/targets-file.ts
1386
- var import_node_fs3 = require("fs");
1387
- var import_promises4 = require("fs/promises");
1388
- var import_node_path4 = __toESM(require("path"), 1);
2435
+ var import_node_fs4 = require("fs");
2436
+ var import_promises5 = require("fs/promises");
2437
+ var import_node_path7 = __toESM(require("path"), 1);
1389
2438
  var import_yaml2 = require("yaml");
1390
2439
 
1391
2440
  // src/evaluation/providers/types.ts
@@ -1446,18 +2495,18 @@ function assertTargetDefinition(value, index, filePath) {
1446
2495
  }
1447
2496
  async function fileExists3(filePath) {
1448
2497
  try {
1449
- await (0, import_promises4.access)(filePath, import_node_fs3.constants.F_OK);
2498
+ await (0, import_promises5.access)(filePath, import_node_fs4.constants.F_OK);
1450
2499
  return true;
1451
2500
  } catch {
1452
2501
  return false;
1453
2502
  }
1454
2503
  }
1455
2504
  async function readTargetDefinitions(filePath) {
1456
- const absolutePath = import_node_path4.default.resolve(filePath);
2505
+ const absolutePath = import_node_path7.default.resolve(filePath);
1457
2506
  if (!await fileExists3(absolutePath)) {
1458
2507
  throw new Error(`targets.yaml not found at ${absolutePath}`);
1459
2508
  }
1460
- const raw = await (0, import_promises4.readFile)(absolutePath, "utf8");
2509
+ const raw = await (0, import_promises5.readFile)(absolutePath, "utf8");
1461
2510
  const parsed = (0, import_yaml2.parse)(raw);
1462
2511
  if (!isRecord(parsed)) {
1463
2512
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
@@ -1480,6 +2529,10 @@ function createProvider(target) {
1480
2529
  return new AnthropicProvider(target.name, target.config);
1481
2530
  case "gemini":
1482
2531
  return new GeminiProvider(target.name, target.config);
2532
+ case "cli":
2533
+ return new CliProvider(target.name, target.config);
2534
+ case "codex":
2535
+ return new CodexProvider(target.name, target.config);
1483
2536
  case "mock":
1484
2537
  return new MockProvider(target.name, target.config);
1485
2538
  case "vscode":
@@ -1496,230 +2549,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
1496
2549
  return createProvider(resolved);
1497
2550
  }
1498
2551
 
1499
- // src/evaluation/scoring.ts
1500
- var KEY_TERM_MATCH_THRESHOLD = 0.5;
1501
- var ACTION_WORDS = /* @__PURE__ */ new Set([
1502
- "use",
1503
- "avoid",
1504
- "prefer",
1505
- "replace",
1506
- "consider",
1507
- "ensure",
1508
- "remove",
1509
- "add"
1510
- ]);
1511
- var STOP_WORDS = /* @__PURE__ */ new Set([
1512
- "the",
1513
- "a",
1514
- "an",
1515
- "and",
1516
- "or",
1517
- "but",
1518
- "in",
1519
- "on",
1520
- "at",
1521
- "to",
1522
- "for",
1523
- "of",
1524
- "with",
1525
- "by",
1526
- "is",
1527
- "are",
1528
- "was",
1529
- "were",
1530
- "be",
1531
- "been",
1532
- "being",
1533
- "have",
1534
- "has",
1535
- "had",
1536
- "do",
1537
- "does",
1538
- "did",
1539
- "will",
1540
- "would",
1541
- "could",
1542
- "should"
1543
- ]);
1544
- var ERROR_PREFIXES = [
1545
- "error:",
1546
- "err:",
1547
- "vs code command failed",
1548
- "exception",
1549
- "traceback",
1550
- "no response file was generated",
1551
- "timed out",
1552
- "cli not found"
1553
- ];
1554
- function extractAspects(expectedResponse) {
1555
- const lines = expectedResponse.split(/\r?\n/).map((line) => line.trim());
1556
- const aspects = [];
1557
- for (const line of lines) {
1558
- if (line.length === 0) {
1559
- continue;
1560
- }
1561
- const bulletMatch = /^([-*•]|[0-9]+\.)\s*(.+)$/.exec(line);
1562
- if (bulletMatch) {
1563
- const normalized = normalizeAspect(bulletMatch[2]);
1564
- if (normalized.length > 0) {
1565
- aspects.push(normalized);
1566
- }
1567
- continue;
1568
- }
1569
- const lowered = line.toLowerCase();
1570
- if (Array.from(ACTION_WORDS).some((word) => lowered.startsWith(word))) {
1571
- const normalized = normalizeAspect(line);
1572
- if (normalized.length > 0) {
1573
- aspects.push(normalized);
1574
- }
1575
- }
1576
- }
1577
- return aspects;
1578
- }
1579
- function calculateHits(candidateResponse, expectedAspects) {
1580
- const { normalizedText, words } = normalizeCandidate(candidateResponse);
1581
- const hits = [];
1582
- for (const aspect of expectedAspects) {
1583
- if (matchesAspect(aspect, normalizedText, words)) {
1584
- hits.push(aspect);
1585
- }
1586
- }
1587
- return hits;
1588
- }
1589
- function calculateMisses(candidateResponse, expectedAspects, resolvedHits) {
1590
- const hits = new Set(resolvedHits ?? calculateHits(candidateResponse, expectedAspects));
1591
- return expectedAspects.filter((aspect) => !hits.has(aspect));
1592
- }
1593
/**
 * Scores a candidate response against a list of expected aspects.
 *
 * With no expected aspects the score is binary: 0 (with a single explanatory
 * miss) when `isErrorLike` flags the candidate, otherwise 1. With aspects the
 * score is the fraction of aspects reported by `calculateHits`.
 *
 * @param {string} candidateResponse - Raw model answer.
 * @param {string[]} expectedAspects - Aspects the answer should cover.
 * @returns {{score: number, hits: string[], misses: string[], hitCount: number,
 *   totalAspects: number, rawAspects: string[]}} Scoring summary.
 */
function scoreCandidateResponse(candidateResponse, expectedAspects) {
  const totalAspects = expectedAspects.length;

  if (totalAspects === 0) {
    const looksLikeError = isErrorLike(candidateResponse);
    return {
      score: looksLikeError ? 0 : 1,
      hits: [],
      misses: looksLikeError ? ["Model produced an error instead of an answer."] : [],
      hitCount: 0,
      totalAspects: 0,
      rawAspects: []
    };
  }

  const hits = calculateHits(candidateResponse, expectedAspects);
  const misses = expectedAspects.filter((aspect) => !hits.includes(aspect));
  return {
    // totalAspects is known to be > 0 here, so the division is safe.
    score: hits.length / totalAspects,
    hits,
    misses,
    hitCount: hits.length,
    totalAspects,
    rawAspects: expectedAspects
  };
}
1626
/**
 * Tells whether a response text starts with one of the known error prefixes.
 *
 * The text is trimmed and lower-cased before being compared against each
 * entry of `ERROR_PREFIXES`. Empty or missing text is never error-like.
 *
 * @param {string | null | undefined} text - Response text to inspect.
 * @returns {boolean} True when a prefix matches.
 */
function isErrorLike(text) {
  if (!text) {
    return false;
  }
  const normalized = text.trim().toLowerCase();
  for (const prefix of ERROR_PREFIXES) {
    if (normalized.startsWith(prefix)) {
      return true;
    }
  }
  return false;
}
1633
/**
 * Canonicalizes an aspect string for comparison.
 *
 * Lower-cases the text, replaces every character that is neither a word
 * character nor whitespace with a space, collapses whitespace runs to a single
 * space and trims the ends.
 *
 * @param {string} aspect - Raw aspect text.
 * @returns {string} Normalized aspect (possibly empty).
 */
function normalizeAspect(aspect) {
  const lowered = aspect.toLowerCase();
  const withoutPunctuation = lowered.replace(/[^\w\s]/g, " ");
  const singleSpaced = withoutPunctuation.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
1637
/**
 * Prepares a candidate response for aspect matching.
 *
 * @param {string} candidate - Raw model answer.
 * @returns {{normalizedText: string, words: Set<string>}} The lower-cased text with
 *   non-word, non-space characters replaced by spaces (whitespace is NOT collapsed),
 *   plus the set of distinct non-empty tokens obtained by splitting on whitespace.
 */
function normalizeCandidate(candidate) {
  const normalizedText = candidate.toLowerCase().replace(/[^\w\s]/g, " ");
  const words = new Set();
  for (const token of normalizedText.split(/\s+/)) {
    if (token.length > 0) {
      words.add(token);
    }
  }
  return { normalizedText, words };
}
1643
/**
 * Decides whether a normalized candidate covers a single aspect.
 *
 * Two checks run in order:
 *  1. The share of the aspect's key terms (from `extractKeyTerms`) present in
 *     `candidateWords` reaches `KEY_TERM_MATCH_THRESHOLD`.
 *  2. Otherwise, any pair of adjacent space-separated aspect words appears
 *     verbatim in `candidateNormalized`.
 * An aspect without key terms never matches.
 *
 * @param {string} aspect - Normalized aspect text.
 * @param {string} candidateNormalized - Normalized candidate text.
 * @param {Set<string>} candidateWords - Distinct candidate tokens.
 * @returns {boolean} True when either check succeeds.
 */
function matchesAspect(aspect, candidateNormalized, candidateWords) {
  const keyTerms = extractKeyTerms(aspect);
  if (keyTerms.length === 0) {
    return false;
  }

  let matched = 0;
  for (const term of keyTerms) {
    if (candidateWords.has(term)) {
      matched += 1;
    }
  }
  if (matched / keyTerms.length >= KEY_TERM_MATCH_THRESHOLD) {
    return true;
  }

  // Fallback: look for any adjacent word pair of the aspect inside the candidate.
  const tokens = aspect.split(" ");
  return tokens
    .slice(0, -1)
    .some((token, position) => candidateNormalized.includes(`${token} ${tokens[position + 1]}`));
}
1664
/**
 * Picks the leading significant words of an aspect.
 *
 * A word is significant when it is longer than two characters and is not in
 * `STOP_WORDS`. Scanning stops as soon as the collected count reaches
 * `maxTerms`.
 *
 * @param {string} aspect - Space-separated aspect text.
 * @param {number} [maxTerms=5] - Upper bound on returned terms.
 * @returns {string[]} Significant words in their original order.
 */
function extractKeyTerms(aspect, maxTerms = 5) {
  const words = aspect.split(" ");
  const terms = [];
  let cursor = 0;
  while (cursor < words.length) {
    const word = words[cursor];
    cursor += 1;
    const isSignificant = word.length > 2 && !STOP_WORDS.has(word);
    if (isSignificant) {
      terms.push(word);
      if (terms.length >= maxTerms) {
        break;
      }
    }
  }
  return terms;
}
1681
-
1682
- // src/evaluation/grading.ts
2552
+ // src/evaluation/evaluators.ts
1683
2553
  var import_node_crypto = require("crypto");
1684
/**
 * Grader that scores a candidate by keyword/aspect matching, without calling a model.
 */
var HeuristicGrader = class {
  kind = "heuristic";

  /**
   * Grades `context.candidate` against the aspects extracted from
   * `context.evalCase.expected_assistant_raw`.
   *
   * When no aspects are expected and the candidate is error-like, the first
   * line of the candidate is placed at the front of the misses (unless it is
   * already listed).
   *
   * @param {{candidate: string, evalCase: {expected_assistant_raw: unknown}}} context
   * @returns {{score: number, hits: string[], misses: string[],
   *   expectedAspectCount: number, rawAspects: string[]}}
   */
  grade(context) {
    const { evalCase, candidate } = context;
    const expectedAspects = extractAspects(evalCase.expected_assistant_raw);
    const {
      score,
      hits,
      misses: scoredMisses,
      totalAspects,
      rawAspects
    } = scoreCandidateResponse(candidate, expectedAspects);

    // Work on a copy so the scoring result is not mutated.
    const misses = [...scoredMisses];
    if (expectedAspects.length === 0 && isErrorLike(candidate)) {
      const firstLine = candidate.split(/\r?\n/)[0]?.trim();
      if (firstLine && !misses.includes(firstLine)) {
        misses.unshift(firstLine);
      }
    }

    return { score, hits, misses, expectedAspectCount: totalAspects, rawAspects };
  }
};
1705
- var QualityGrader = class {
2554
+ var LlmJudgeEvaluator = class {
1706
2555
  kind = "llm_judge";
1707
2556
  resolveJudgeProvider;
1708
2557
  maxOutputTokens;
1709
2558
  temperature;
2559
+ customPrompt;
1710
2560
  constructor(options) {
1711
2561
  this.resolveJudgeProvider = options.resolveJudgeProvider;
1712
2562
  this.maxOutputTokens = options.maxOutputTokens;
1713
2563
  this.temperature = options.temperature;
2564
+ this.customPrompt = options.customPrompt;
1714
2565
  }
1715
- async grade(context) {
2566
+ async evaluate(context) {
1716
2567
  const judgeProvider = await this.resolveJudgeProvider(context);
1717
2568
  if (!judgeProvider) {
1718
2569
  throw new Error("No judge provider available for LLM grading");
1719
2570
  }
1720
2571
  const prompt = buildQualityPrompt(context.evalCase, context.candidate);
2572
+ const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
1721
2573
  const metadata = {
1722
- systemPrompt: QUALITY_SYSTEM_PROMPT
2574
+ ...systemPrompt !== void 0 ? { systemPrompt } : {},
2575
+ ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
1723
2576
  };
1724
2577
  const response = await judgeProvider.invoke({
1725
2578
  prompt,
@@ -1734,12 +2587,13 @@ var QualityGrader = class {
1734
2587
  const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
1735
2588
  const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
1736
2589
  const reasoning = parsed.reasoning ?? response.reasoning;
1737
- const graderRawRequest = {
2590
+ const evaluatorRawRequest = {
1738
2591
  id: (0, import_node_crypto.randomUUID)(),
1739
2592
  provider: judgeProvider.id,
1740
2593
  prompt,
1741
- systemPrompt: QUALITY_SYSTEM_PROMPT,
1742
- target: context.target.name
2594
+ target: context.target.name,
2595
+ ...systemPrompt !== void 0 ? { systemPrompt } : {},
2596
+ ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
1743
2597
  };
1744
2598
  return {
1745
2599
  score,
@@ -1747,7 +2601,7 @@ var QualityGrader = class {
1747
2601
  misses,
1748
2602
  expectedAspectCount: hits.length + misses.length || 1,
1749
2603
  reasoning,
1750
- graderRawRequest
2604
+ evaluatorRawRequest
1751
2605
  };
1752
2606
  }
1753
2607
  };
@@ -1865,11 +2719,117 @@ function extractJsonBlob(text) {
1865
2719
/**
 * Type guard for strings that contain at least one non-whitespace character.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} True for non-blank strings, false for everything else.
 */
function isNonEmptyString(value) {
  if (typeof value !== "string") {
    return false;
  }
  return value.trim() !== "";
}
2722
/**
 * Evaluator that delegates scoring to an external script.
 *
 * The script receives a JSON document on stdin and is expected to print a JSON
 * object on stdout; `score`, `hits`, `misses` and `reasoning` are read from it.
 */
var CodeEvaluator = class {
  kind = "code";
  script;
  cwd;
  agentTimeoutMs;

  /**
   * @param {{script: string, cwd?: string, agentTimeoutMs?: number}} options
   */
  constructor(options) {
    const { script, cwd, agentTimeoutMs } = options;
    this.script = script;
    this.cwd = cwd;
    this.agentTimeoutMs = agentTimeoutMs;
  }

  /**
   * Runs the script for one eval case and converts its output into a score.
   *
   * Script failures never reject: they are reported as a zero score whose
   * single miss and `reasoning` carry the error message. Output that is not
   * valid JSON (or lacks the expected fields) falls back to score 0 and empty
   * hit/miss lists.
   *
   * @param {object} context - Must provide `evalCase`, `candidate` and `promptInputs`.
   * @returns {Promise<object>} Score object including `evaluatorRawRequest`.
   */
  async evaluate(context) {
    const { evalCase, candidate, promptInputs } = context;
    const payload = {
      task: evalCase.task,
      outcome: evalCase.outcome,
      expected: evalCase.expected_assistant_raw,
      output: candidate,
      system_message: promptInputs.systemMessage ?? "",
      guideline_paths: evalCase.guideline_paths,
      attachments: evalCase.file_paths,
      user_segments: evalCase.user_segments
    };
    const inputPayload = JSON.stringify(payload, null, 2);

    // Shared description of the invocation; `cwd` is only recorded when set.
    const rawRequest = {
      script: this.script,
      ...this.cwd ? { cwd: this.cwd } : {}
    };

    try {
      const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
      const parsed = parseJsonSafe(stdout);

      const rawScore = typeof parsed?.score === "number" ? parsed.score : 0;
      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;

      return {
        score: clampScore(rawScore),
        hits,
        misses,
        // Never report zero expected aspects.
        expectedAspectCount: hits.length + misses.length || 1,
        reasoning,
        evaluatorRawRequest: rawRequest
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        hits: [],
        misses: [`Code evaluator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: { ...rawRequest, error: message }
      };
    }
  }
};
2782
/**
 * Runs an evaluator script through the shell, feeds it `input` on stdin and
 * returns its trimmed stdout.
 *
 * @param {string} scriptPath - Command line to execute (spawned with `shell: true`).
 * @param {string} input - Text written to the child's stdin.
 * @param {number} [agentTimeoutMs] - When truthy, the child is killed and the
 *   promise rejects after this many milliseconds.
 * @param {string} [cwd] - Working directory for the child process.
 * @returns {Promise<string>} Trimmed stdout of the script.
 * @throws {Error} When spawning fails, the timeout elapses, or the script exits
 *   with a non-zero code after writing to stderr.
 */
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
  const { spawn: spawn2 } = await import("child_process");
  return await new Promise((resolve, reject) => {
    const child = spawn2(scriptPath, {
      shell: true,
      cwd
    });
    let stdout = "";
    let stderr = "";
    let settled = false;
    let timeout;

    // Settle the promise exactly once and always release the timer.
    const settle = (finish) => {
      if (settled) {
        return;
      }
      settled = true;
      if (timeout !== void 0) {
        clearTimeout(timeout);
      }
      finish();
    };

    if (agentTimeoutMs) {
      timeout = setTimeout(() => {
        child.kill();
        settle(() => reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`)));
      }, agentTimeoutMs);
    }

    // Decode at the stream level so multi-byte characters split across chunks
    // are reassembled instead of being corrupted by per-chunk toString().
    child.stdout?.setEncoding("utf8");
    child.stderr?.setEncoding("utf8");
    child.stdout?.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr?.on("data", (chunk) => {
      stderr += chunk;
    });

    child.on("error", (error) => {
      settle(() => reject(error));
    });

    // "close" (unlike "exit") fires only after the stdio streams have ended,
    // so everything the script printed has been collected at this point.
    child.on("close", (code) => {
      settle(() => {
        // Kept from the original contract: a non-zero exit is only treated as
        // a failure when the script also reported something on stderr.
        if (code && stderr.length > 0) {
          reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
          return;
        }
        resolve(stdout.trim());
      });
    });

    // A script may exit without reading its input; the resulting EPIPE on
    // stdin must not surface as an unhandled "error" event. The outcome is
    // decided by the exit status handled above.
    child.stdin?.on("error", () => {
    });
    child.stdin?.end(input);
  });
}
2821
/**
 * Parses JSON without throwing.
 *
 * @param {string} payload - Text expected to contain JSON.
 * @returns {unknown} The parsed value, or `undefined` when `JSON.parse` throws.
 */
function parseJsonSafe(payload) {
  let parsed;
  try {
    parsed = JSON.parse(payload);
  } catch {
    parsed = void 0;
  }
  return parsed;
}
1868
2828
 
1869
2829
  // src/evaluation/orchestrator.ts
1870
2830
  var import_node_crypto2 = require("crypto");
1871
- var import_promises5 = require("fs/promises");
1872
- var import_node_path5 = __toESM(require("path"), 1);
2831
+ var import_promises6 = require("fs/promises");
2832
+ var import_node_path8 = __toESM(require("path"), 1);
1873
2833
 
1874
2834
  // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
1875
2835
  var Node = class {
@@ -2016,7 +2976,7 @@ async function runEvaluation(options) {
2016
2976
  targets,
2017
2977
  env,
2018
2978
  providerFactory,
2019
- graders,
2979
+ evaluators,
2020
2980
  maxRetries,
2021
2981
  agentTimeoutMs,
2022
2982
  promptDumpDir,
@@ -2075,7 +3035,7 @@ async function runEvaluation(options) {
2075
3035
  }
2076
3036
  return getOrCreateProvider(resolvedJudge);
2077
3037
  };
2078
- const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
3038
+ const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
2079
3039
  const primaryProvider = getOrCreateProvider(target);
2080
3040
  const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
2081
3041
  if (target.providerBatching && !providerSupportsBatch && verbose) {
@@ -2098,13 +3058,14 @@ async function runEvaluation(options) {
2098
3058
  evalCases: filteredEvalCases,
2099
3059
  provider: primaryProvider,
2100
3060
  target,
2101
- graderRegistry,
3061
+ evaluatorRegistry,
2102
3062
  promptDumpDir,
2103
3063
  nowFn: now ?? (() => /* @__PURE__ */ new Date()),
2104
3064
  onProgress,
2105
3065
  onResult,
2106
3066
  verbose,
2107
- resolveJudgeProvider
3067
+ resolveJudgeProvider,
3068
+ agentTimeoutMs
2108
3069
  });
2109
3070
  } catch (error) {
2110
3071
  if (verbose) {
@@ -2135,7 +3096,7 @@ async function runEvaluation(options) {
2135
3096
  evalCase,
2136
3097
  provider: primaryProvider,
2137
3098
  target,
2138
- graders: graderRegistry,
3099
+ evaluators: evaluatorRegistry,
2139
3100
  maxRetries,
2140
3101
  agentTimeoutMs,
2141
3102
  promptDumpDir,
@@ -2201,12 +3162,13 @@ async function runBatchEvaluation(options) {
2201
3162
  evalCases,
2202
3163
  provider,
2203
3164
  target,
2204
- graderRegistry,
3165
+ evaluatorRegistry,
2205
3166
  promptDumpDir,
2206
3167
  nowFn,
2207
3168
  onProgress,
2208
3169
  onResult,
2209
- resolveJudgeProvider
3170
+ resolveJudgeProvider,
3171
+ agentTimeoutMs
2210
3172
  } = options;
2211
3173
  const promptInputsList = [];
2212
3174
  for (const evalCase of evalCases) {
@@ -2222,7 +3184,7 @@ async function runBatchEvaluation(options) {
2222
3184
  prompt: promptInputs.request,
2223
3185
  guidelines: promptInputs.guidelines,
2224
3186
  guideline_patterns: evalCase.guideline_patterns,
2225
- attachments: evalCase.file_paths,
3187
+ inputFiles: evalCase.file_paths,
2226
3188
  evalCaseId: evalCase.id,
2227
3189
  metadata: {
2228
3190
  systemPrompt: promptInputs.systemMessage ?? ""
@@ -2254,23 +3216,19 @@ async function runBatchEvaluation(options) {
2254
3216
  const evalCase = evalCases[i];
2255
3217
  const promptInputs = promptInputsList[i];
2256
3218
  const providerResponse = batchResponse[i];
2257
- const now = nowFn();
2258
- const graderKind = evalCase.grader ?? "heuristic";
2259
- const activeGrader = graderRegistry[graderKind] ?? graderRegistry.heuristic;
2260
- if (!activeGrader) {
2261
- throw new Error(`No grader registered for kind '${graderKind}'`);
2262
- }
2263
- let grade;
3219
+ let result;
2264
3220
  try {
2265
- grade = await activeGrader.grade({
3221
+ result = await evaluateCandidate({
2266
3222
  evalCase,
2267
3223
  candidate: providerResponse.text ?? "",
2268
3224
  target,
2269
3225
  provider,
2270
- attempt: 0,
3226
+ evaluators: evaluatorRegistry,
2271
3227
  promptInputs,
2272
- now,
2273
- judgeProvider: await resolveJudgeProvider(target)
3228
+ nowFn,
3229
+ attempt: 0,
3230
+ judgeProvider: await resolveJudgeProvider(target),
3231
+ agentTimeoutMs
2274
3232
  });
2275
3233
  } catch (error) {
2276
3234
  const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
@@ -2289,28 +3247,6 @@ async function runBatchEvaluation(options) {
2289
3247
  }
2290
3248
  continue;
2291
3249
  }
2292
- const completedAt = nowFn();
2293
- const rawRequest = {
2294
- request: promptInputs.request,
2295
- guidelines: promptInputs.guidelines,
2296
- guideline_paths: evalCase.guideline_paths,
2297
- system_message: promptInputs.systemMessage ?? ""
2298
- };
2299
- const result = {
2300
- eval_id: evalCase.id,
2301
- conversation_id: evalCase.conversation_id,
2302
- score: grade.score,
2303
- hits: grade.hits,
2304
- misses: grade.misses,
2305
- model_answer: providerResponse.text ?? "",
2306
- expected_aspect_count: grade.expectedAspectCount,
2307
- target: target.name,
2308
- timestamp: completedAt.toISOString(),
2309
- reasoning: grade.reasoning,
2310
- raw_aspects: grade.rawAspects,
2311
- raw_request: rawRequest,
2312
- grader_raw_request: grade.graderRawRequest
2313
- };
2314
3250
  results.push(result);
2315
3251
  if (onResult) {
2316
3252
  await onResult(result);
@@ -2332,7 +3268,7 @@ async function runEvalCase(options) {
2332
3268
  evalCase,
2333
3269
  provider,
2334
3270
  target,
2335
- graders,
3271
+ evaluators,
2336
3272
  now,
2337
3273
  maxRetries,
2338
3274
  agentTimeoutMs,
@@ -2387,27 +3323,49 @@ async function runEvalCase(options) {
2387
3323
  if (cacheKey && cache && !cachedResponse) {
2388
3324
  await cache.set(cacheKey, providerResponse);
2389
3325
  }
2390
- const graderKind = evalCase.grader ?? "heuristic";
2391
- const activeGrader = graders[graderKind] ?? graders.heuristic;
2392
- if (!activeGrader) {
2393
- throw new Error(`No grader registered for kind '${graderKind}'`);
2394
- }
2395
- let grade;
2396
3326
  try {
2397
- const gradeTimestamp = nowFn();
2398
- grade = await activeGrader.grade({
3327
+ return await evaluateCandidate({
2399
3328
  evalCase,
2400
3329
  candidate: providerResponse.text ?? "",
2401
3330
  target,
2402
3331
  provider,
2403
- attempt,
3332
+ evaluators,
2404
3333
  promptInputs,
2405
- now: gradeTimestamp,
2406
- judgeProvider
3334
+ nowFn,
3335
+ attempt,
3336
+ judgeProvider,
3337
+ agentTimeoutMs
2407
3338
  });
2408
3339
  } catch (error) {
2409
3340
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
2410
3341
  }
3342
+ }
3343
+ async function evaluateCandidate(options) {
3344
+ const {
3345
+ evalCase,
3346
+ candidate,
3347
+ target,
3348
+ provider,
3349
+ evaluators,
3350
+ promptInputs,
3351
+ nowFn,
3352
+ attempt,
3353
+ judgeProvider,
3354
+ agentTimeoutMs
3355
+ } = options;
3356
+ const gradeTimestamp = nowFn();
3357
+ const { score, evaluatorResults } = await runEvaluatorsForCase({
3358
+ evalCase,
3359
+ candidate,
3360
+ target,
3361
+ provider,
3362
+ evaluators,
3363
+ attempt,
3364
+ promptInputs,
3365
+ now: gradeTimestamp,
3366
+ judgeProvider,
3367
+ agentTimeoutMs
3368
+ });
2411
3369
  const completedAt = nowFn();
2412
3370
  const rawRequest = {
2413
3371
  request: promptInputs.request,
@@ -2418,28 +3376,200 @@ async function runEvalCase(options) {
2418
3376
  return {
2419
3377
  eval_id: evalCase.id,
2420
3378
  conversation_id: evalCase.conversation_id,
2421
- score: grade.score,
2422
- hits: grade.hits,
2423
- misses: grade.misses,
2424
- model_answer: providerResponse.text ?? "",
2425
- expected_aspect_count: grade.expectedAspectCount,
3379
+ score: score.score,
3380
+ hits: score.hits,
3381
+ misses: score.misses,
3382
+ model_answer: candidate,
3383
+ expected_aspect_count: score.expectedAspectCount,
2426
3384
  target: target.name,
2427
3385
  timestamp: completedAt.toISOString(),
2428
- reasoning: grade.reasoning,
2429
- raw_aspects: grade.rawAspects,
3386
+ reasoning: score.reasoning,
3387
+ raw_aspects: score.rawAspects,
2430
3388
  raw_request: rawRequest,
2431
- grader_raw_request: grade.graderRawRequest
3389
+ evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
3390
+ evaluator_results: evaluatorResults
2432
3391
  };
2433
3392
  }
3393
/**
 * Scores one candidate, choosing between the per-case evaluator list and the
 * single registry evaluator.
 *
 * - When `evalCase.evaluators` is a non-empty list, the work is delegated to
 *   `runEvaluatorList` and its `{ score, evaluatorResults }` is returned.
 * - Otherwise the evaluator registered under `evalCase.evaluator` (default
 *   `"llm_judge"`) is used, falling back to the `llm_judge` entry when that
 *   kind is not registered. Only `{ score }` is returned in this case.
 *
 * @param {object} options - Evaluation inputs; `options.evaluators` is the registry.
 * @returns {Promise<{score: object, evaluatorResults?: object[]}>}
 * @throws {Error} When neither the requested kind nor `llm_judge` is registered.
 */
async function runEvaluatorsForCase(options) {
  const {
    evalCase,
    candidate,
    target,
    provider,
    evaluators,
    attempt,
    promptInputs,
    now,
    judgeProvider,
    agentTimeoutMs
  } = options;

  const configured = evalCase.evaluators;
  if (configured?.length > 0) {
    return runEvaluatorList({
      evalCase,
      evaluators: configured,
      candidate,
      target,
      provider,
      evaluatorRegistry: evaluators,
      attempt,
      promptInputs,
      now,
      judgeProvider,
      agentTimeoutMs
    });
  }

  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
  const activeEvaluator = evaluators[evaluatorKind] ?? evaluators.llm_judge;
  if (!activeEvaluator) {
    throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
  }

  const evaluationContext = {
    evalCase,
    candidate,
    target,
    provider,
    attempt,
    promptInputs,
    now,
    judgeProvider
  };
  const score = await activeEvaluator.evaluate(evaluationContext);
  return { score };
}
3427
/**
 * Runs every evaluator configured on an eval case and aggregates the results.
 *
 * Supported `type` values are `"llm_judge"` (delegated to
 * `runLlmJudgeEvaluator`) and `"code"` (a fresh `CodeEvaluator`). Entries with
 * any other type are skipped and contribute nothing to the aggregate.
 *
 * A failing evaluator does not abort the run: it is recorded with score 0 and
 * a miss of the form `Evaluator '<name>' failed: <message>`, where a missing
 * name is reported as `unknown`.
 *
 * @param {object} options
 * @param {object[]} [options.evaluators] - Evaluator configs from the eval case.
 * @param {object} options.evaluatorRegistry - Registry handed to the LLM judge runner.
 * @param {number} [options.agentTimeoutMs] - Timeout forwarded to code evaluators.
 * @returns {Promise<{score: object, evaluatorResults: object[]}>} `score` is the
 *   aggregate (mean score, concatenated hits/misses, summed expected aspect
 *   counts, joined reasoning); `evaluatorResults` holds one entry per evaluator run.
 */
async function runEvaluatorList(options) {
  const {
    evalCase,
    evaluators,
    candidate,
    target,
    provider,
    evaluatorRegistry,
    attempt,
    promptInputs,
    now,
    judgeProvider,
    agentTimeoutMs
  } = options;
  const scored = [];
  const evaluatorResults = [];

  for (const evaluator of evaluators ?? []) {
    try {
      let outcome;
      if (evaluator.type === "llm_judge") {
        outcome = await runLlmJudgeEvaluator({
          config: evaluator,
          evalCase,
          candidate,
          target,
          provider,
          evaluatorRegistry,
          attempt,
          promptInputs,
          now,
          judgeProvider
        });
      } else if (evaluator.type === "code") {
        const codeEvaluator = new CodeEvaluator({
          script: evaluator.script,
          cwd: evaluator.resolvedCwd ?? evaluator.cwd,
          agentTimeoutMs
        });
        outcome = await codeEvaluator.evaluate({
          evalCase,
          candidate,
          target,
          provider,
          attempt,
          promptInputs,
          now
        });
      } else {
        // Unrecognized evaluator types are ignored.
        continue;
      }

      // Build the report entry before recording anything, so a malformed
      // outcome cannot leave a half-recorded evaluator behind.
      const resultEntry = {
        name: evaluator.name,
        type: evaluator.type,
        score: outcome.score,
        hits: outcome.hits,
        misses: outcome.misses,
        reasoning: outcome.reasoning,
        evaluator_raw_request: outcome.evaluatorRawRequest
      };
      scored.push({ score: outcome, name: evaluator.name, type: evaluator.type });
      evaluatorResults.push(resultEntry);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      // Use the same fallback labels everywhere so the aggregated miss text
      // never reads "Evaluator 'undefined' failed".
      const name = evaluator?.name ?? "unknown";
      const type = evaluator?.type ?? "unknown";
      const failure = `Evaluator '${name}' failed: ${message}`;
      scored.push({
        score: {
          score: 0,
          hits: [],
          misses: [failure],
          expectedAspectCount: 1,
          reasoning: message
        },
        name,
        type
      });
      evaluatorResults.push({
        name,
        type,
        score: 0,
        hits: [],
        misses: [failure],
        reasoning: message
      });
    }
  }

  const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
  const hits = scored.flatMap((entry) => entry.score.hits);
  const misses = scored.flatMap((entry) => entry.score.misses);
  const expectedAspectCount = scored.reduce((total, entry) => total + (entry.score.expectedAspectCount ?? 0), 0);
  const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
  const score = {
    score: aggregateScore,
    hits,
    misses,
    expectedAspectCount,
    reasoning,
    rawAspects: rawAspects.length > 0 ? rawAspects : void 0
  };
  return { score, evaluatorResults };
}
3534
/**
 * Runs the registry's `llm_judge` evaluator for one configured judge entry.
 *
 * The custom prompt resolved from `config` (see `resolveCustomPrompt`) is
 * passed as `systemPrompt`, the config itself as `evaluator`, and
 * `config.model` as `judgeModel`.
 *
 * @param {object} options - `config`, `evaluatorRegistry` and the evaluation inputs.
 * @returns {Promise<object>} Whatever the `llm_judge` evaluator returns.
 */
async function runLlmJudgeEvaluator(options) {
  const {
    config,
    evalCase,
    candidate,
    target,
    provider,
    evaluatorRegistry,
    attempt,
    promptInputs,
    now,
    judgeProvider
  } = options;

  const systemPrompt = await resolveCustomPrompt(config);
  const judgeContext = {
    evalCase,
    candidate,
    target,
    provider,
    attempt,
    promptInputs,
    now,
    judgeProvider,
    systemPrompt,
    evaluator: config,
    judgeModel: config.model
  };
  return evaluatorRegistry.llm_judge.evaluate(judgeContext);
}
3551
/**
 * Resolves the custom judge prompt for an evaluator config.
 *
 * When `config.promptPath` is set, the file is read as UTF-8 and its contents
 * returned. If reading fails, a warning is logged and the inline
 * `config.prompt` is used instead. Without a path, `config.prompt` is returned
 * directly (it may be undefined).
 *
 * @param {{promptPath?: string, prompt?: string}} config - Evaluator configuration.
 * @returns {Promise<string | undefined>} The prompt text, if any.
 */
async function resolveCustomPrompt(config) {
  if (!config.promptPath) {
    return config.prompt;
  }
  try {
    const { readFile } = import_promises6;
    return await readFile(config.promptPath, "utf8");
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
    return config.prompt;
  }
}
3562
/**
 * Type guard for strings that contain at least one non-whitespace character.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} True for non-blank strings, false for everything else.
 */
function isNonEmptyString2(value) {
  if (typeof value !== "string") {
    return false;
  }
  return value.trim() !== "";
}
2434
3565
/**
 * Narrows the eval cases to a single id when one is requested.
 *
 * @param {Array<{id: string}>} evalCases - All loaded eval cases.
 * @param {string} [evalId] - Id to keep; when falsy the input array is returned as-is.
 * @returns {Array<{id: string}>} The same array, or a new array with the matching cases.
 */
function filterEvalCases(evalCases, evalId) {
  return evalId ? evalCases.filter(({ id }) => id === evalId) : evalCases;
}
2440
- function buildGraderRegistry(overrides, resolveJudgeProvider) {
2441
- const heuristic = overrides?.heuristic ?? new HeuristicGrader();
2442
- const llmJudge = overrides?.llm_judge ?? new QualityGrader({
3571
+ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
3572
+ const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
2443
3573
  resolveJudgeProvider: async (context) => {
2444
3574
  if (context.judgeProvider) {
2445
3575
  return context.judgeProvider;
@@ -2449,22 +3579,21 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
2449
3579
  });
2450
3580
  return {
2451
3581
  ...overrides,
2452
- heuristic,
2453
3582
  llm_judge: llmJudge
2454
3583
  };
2455
3584
  }
2456
3585
  async function dumpPrompt(directory, evalCase, promptInputs) {
2457
3586
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
2458
3587
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
2459
- const filePath = import_node_path5.default.resolve(directory, filename);
2460
- await (0, import_promises5.mkdir)(import_node_path5.default.dirname(filePath), { recursive: true });
3588
+ const filePath = import_node_path8.default.resolve(directory, filename);
3589
+ await (0, import_promises6.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
2461
3590
  const payload = {
2462
3591
  eval_id: evalCase.id,
2463
3592
  request: promptInputs.request,
2464
3593
  guidelines: promptInputs.guidelines,
2465
3594
  guideline_paths: evalCase.guideline_paths
2466
3595
  };
2467
- await (0, import_promises5.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
3596
+ await (0, import_promises6.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
2468
3597
  }
2469
3598
  function sanitizeFilename(value) {
2470
3599
  if (!value) {
@@ -2474,7 +3603,7 @@ function sanitizeFilename(value) {
2474
3603
  return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
2475
3604
  }
2476
3605
  async function invokeProvider(provider, options) {
2477
- const { evalCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
3606
+ const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
2478
3607
  const controller = new AbortController();
2479
3608
  const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
2480
3609
  if (signal) {
@@ -2485,7 +3614,7 @@ async function invokeProvider(provider, options) {
2485
3614
  prompt: promptInputs.request,
2486
3615
  guidelines: promptInputs.guidelines,
2487
3616
  guideline_patterns: evalCase.guideline_patterns,
2488
- attachments: evalCase.file_paths,
3617
+ inputFiles: evalCase.file_paths,
2489
3618
  evalCaseId: evalCase.id,
2490
3619
  attempt,
2491
3620
  metadata: {
@@ -2554,25 +3683,20 @@ function createAgentKernel() {
2554
3683
  }
2555
3684
  // Annotate the CommonJS export names for ESM import in node:
2556
3685
  0 && (module.exports = {
2557
- GRADER_KINDS,
2558
- HeuristicGrader,
2559
- QualityGrader,
3686
+ CodeEvaluator,
3687
+ LlmJudgeEvaluator,
2560
3688
  TEST_MESSAGE_ROLES,
2561
3689
  buildDirectoryChain,
2562
3690
  buildPromptInputs,
2563
3691
  buildSearchRoots,
2564
- calculateHits,
2565
- calculateMisses,
2566
3692
  createAgentKernel,
2567
3693
  createProvider,
2568
3694
  ensureVSCodeSubagents,
2569
- extractAspects,
2570
3695
  extractCodeBlocks,
2571
3696
  fileExists,
2572
3697
  findGitRoot,
2573
3698
  getHitCount,
2574
- isErrorLike,
2575
- isGraderKind,
3699
+ isEvaluatorKind,
2576
3700
  isGuidelineFile,
2577
3701
  isJsonObject,
2578
3702
  isJsonValue,
@@ -2585,7 +3709,6 @@ function createAgentKernel() {
2585
3709
  resolveFileReference,
2586
3710
  resolveTargetDefinition,
2587
3711
  runEvalCase,
2588
- runEvaluation,
2589
- scoreCandidateResponse
3712
+ runEvaluation
2590
3713
  });
2591
3714
  //# sourceMappingURL=index.cjs.map