@agentv/core 0.2.8 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -30,25 +30,20 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
30
30
  // src/index.ts
31
31
  var index_exports = {};
32
32
  __export(index_exports, {
33
- GRADER_KINDS: () => GRADER_KINDS,
34
- HeuristicGrader: () => HeuristicGrader,
35
- QualityGrader: () => QualityGrader,
33
+ CodeEvaluator: () => CodeEvaluator,
34
+ LlmJudgeEvaluator: () => LlmJudgeEvaluator,
36
35
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
37
36
  buildDirectoryChain: () => buildDirectoryChain,
38
37
  buildPromptInputs: () => buildPromptInputs,
39
38
  buildSearchRoots: () => buildSearchRoots,
40
- calculateHits: () => calculateHits,
41
- calculateMisses: () => calculateMisses,
42
39
  createAgentKernel: () => createAgentKernel,
43
40
  createProvider: () => createProvider,
44
41
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
45
- extractAspects: () => extractAspects,
46
42
  extractCodeBlocks: () => extractCodeBlocks,
47
43
  fileExists: () => fileExists,
48
44
  findGitRoot: () => findGitRoot,
49
45
  getHitCount: () => getHitCount,
50
- isErrorLike: () => isErrorLike,
51
- isGraderKind: () => isGraderKind,
46
+ isEvaluatorKind: () => isEvaluatorKind,
52
47
  isGuidelineFile: () => isGuidelineFile,
53
48
  isJsonObject: () => isJsonObject,
54
49
  isJsonValue: () => isJsonValue,
@@ -61,8 +56,7 @@ __export(index_exports, {
61
56
  resolveFileReference: () => resolveFileReference,
62
57
  resolveTargetDefinition: () => resolveTargetDefinition,
63
58
  runEvalCase: () => runEvalCase,
64
- runEvaluation: () => runEvaluation,
65
- scoreCandidateResponse: () => scoreCandidateResponse
59
+ runEvaluation: () => runEvaluation
66
60
  });
67
61
  module.exports = __toCommonJS(index_exports);
68
62
 
@@ -107,11 +101,10 @@ function isTestMessage(value) {
107
101
  }
108
102
  return candidate.content.every(isJsonObject);
109
103
  }
110
- var GRADER_KIND_VALUES = ["heuristic", "llm_judge"];
111
- var GRADER_KINDS = GRADER_KIND_VALUES;
112
- var GRADER_KIND_SET = new Set(GRADER_KIND_VALUES);
113
- function isGraderKind(value) {
114
- return typeof value === "string" && GRADER_KIND_SET.has(value);
104
+ var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
105
+ var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
106
+ function isEvaluatorKind(value) {
107
+ return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
115
108
  }
116
109
  function getHitCount(result) {
117
110
  return result.hits.length;
@@ -325,7 +318,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
325
318
  if (!Array.isArray(rawTestcases)) {
326
319
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
327
320
  }
328
- const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
321
+ const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
329
322
  const results = [];
330
323
  for (const rawEvalcase of rawTestcases) {
331
324
  if (!isJsonObject(rawEvalcase)) {
@@ -448,7 +441,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
448
441
  const assistantContent = assistantMessages[0]?.content;
449
442
  const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
450
443
  const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
451
- const testCaseGrader = coerceGrader(evalcase.grader) ?? globalGrader;
444
+ const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
445
+ const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
452
446
  const userFilePaths = [];
453
447
  for (const segment of userSegments) {
454
448
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -471,7 +465,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
471
465
  file_paths: allFilePaths,
472
466
  code_snippets: codeSnippets,
473
467
  outcome,
474
- grader: testCaseGrader
468
+ evaluator: testCaseEvaluatorKind,
469
+ evaluators
475
470
  };
476
471
  if (verbose) {
477
472
  console.log(`
@@ -632,14 +627,88 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
632
627
  }
633
628
  return parts.join(" ");
634
629
  }
635
- function coerceGrader(candidate) {
630
+ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
631
+ const execution = rawEvalCase.execution;
632
+ const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators;
633
+ if (candidateEvaluators === void 0) {
634
+ return void 0;
635
+ }
636
+ if (!Array.isArray(candidateEvaluators)) {
637
+ logWarning(`Skipping evaluators for '${evalId}': expected array`);
638
+ return void 0;
639
+ }
640
+ const evaluators = [];
641
+ for (const rawEvaluator of candidateEvaluators) {
642
+ if (!isJsonObject(rawEvaluator)) {
643
+ logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
644
+ continue;
645
+ }
646
+ const name = asString(rawEvaluator.name);
647
+ const typeValue = rawEvaluator.type;
648
+ if (!name || !isEvaluatorKind(typeValue)) {
649
+ logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
650
+ continue;
651
+ }
652
+ if (typeValue === "code") {
653
+ const script = asString(rawEvaluator.script);
654
+ if (!script) {
655
+ logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
656
+ continue;
657
+ }
658
+ const cwd = asString(rawEvaluator.cwd);
659
+ let resolvedCwd;
660
+ if (cwd) {
661
+ const resolved = await resolveFileReference(cwd, searchRoots);
662
+ if (resolved.resolvedPath) {
663
+ resolvedCwd = import_node_path2.default.resolve(resolved.resolvedPath);
664
+ } else {
665
+ logWarning(
666
+ `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
667
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
668
+ );
669
+ }
670
+ }
671
+ evaluators.push({
672
+ name,
673
+ type: "code",
674
+ script,
675
+ cwd,
676
+ resolvedCwd
677
+ });
678
+ continue;
679
+ }
680
+ const prompt = asString(rawEvaluator.prompt);
681
+ let promptPath;
682
+ if (prompt) {
683
+ const resolved = await resolveFileReference(prompt, searchRoots);
684
+ if (resolved.resolvedPath) {
685
+ promptPath = import_node_path2.default.resolve(resolved.resolvedPath);
686
+ } else {
687
+ logWarning(
688
+ `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
689
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
690
+ );
691
+ }
692
+ }
693
+ const model = asString(rawEvaluator.model);
694
+ evaluators.push({
695
+ name,
696
+ type: "llm_judge",
697
+ prompt,
698
+ promptPath,
699
+ model
700
+ });
701
+ }
702
+ return evaluators.length > 0 ? evaluators : void 0;
703
+ }
704
+ function coerceEvaluator(candidate, contextId) {
636
705
  if (typeof candidate !== "string") {
637
706
  return void 0;
638
707
  }
639
- if (isGraderKind(candidate)) {
708
+ if (isEvaluatorKind(candidate)) {
640
709
  return candidate;
641
710
  }
642
- logWarning(`Unknown grader '${candidate}', falling back to default`);
711
+ logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
643
712
  return void 0;
644
713
  }
645
714
  function logWarning(message, details) {
@@ -835,6 +904,214 @@ var GeminiProvider = class {
835
904
  }
836
905
  };
837
906
 
907
+ // src/evaluation/providers/cli.ts
908
+ var import_node_child_process = require("child_process");
909
+ var import_node_path3 = __toESM(require("path"), 1);
910
+ var import_node_util = require("util");
911
+ var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
912
+ var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
913
+ async function defaultCommandRunner(command, options) {
914
+ const execOptions = {
915
+ cwd: options.cwd,
916
+ env: options.env,
917
+ timeout: options.timeoutMs,
918
+ signal: options.signal,
919
+ maxBuffer: DEFAULT_MAX_BUFFER,
920
+ shell: process.platform === "win32" ? "powershell.exe" : void 0
921
+ };
922
+ try {
923
+ const { stdout, stderr } = await execAsync(command, execOptions);
924
+ return {
925
+ stdout,
926
+ stderr,
927
+ exitCode: 0,
928
+ failed: false,
929
+ timedOut: false,
930
+ signal: null
931
+ };
932
+ } catch (error) {
933
+ const execError = error;
934
+ return {
935
+ stdout: execError.stdout ?? "",
936
+ stderr: execError.stderr ?? "",
937
+ exitCode: typeof execError.code === "number" ? execError.code : null,
938
+ failed: true,
939
+ timedOut: execError.timedOut === true || execError.killed === true,
940
+ signal: execError.signal ?? null
941
+ };
942
+ }
943
+ }
944
+ var CliProvider = class {
945
+ id;
946
+ kind = "cli";
947
+ targetName;
948
+ supportsBatch = false;
949
+ config;
950
+ runCommand;
951
+ healthcheckPromise;
952
+ constructor(targetName, config, runner = defaultCommandRunner) {
953
+ this.targetName = targetName;
954
+ this.id = `cli:${targetName}`;
955
+ this.config = config;
956
+ this.runCommand = runner;
957
+ }
958
+ async invoke(request) {
959
+ if (request.signal?.aborted) {
960
+ throw new Error("CLI provider request was aborted before execution");
961
+ }
962
+ await this.ensureHealthy(request.signal);
963
+ const templateValues = buildTemplateValues(request, this.config);
964
+ const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
965
+ const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
966
+ const result = await this.runCommand(renderedCommand, {
967
+ cwd: this.config.cwd,
968
+ env,
969
+ timeoutMs: this.config.timeoutMs,
970
+ signal: request.signal
971
+ });
972
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
973
+ if (request.signal?.aborted) {
974
+ throw new Error("CLI provider request was aborted");
975
+ }
976
+ if (result.timedOut) {
977
+ throw new Error(
978
+ `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
979
+ );
980
+ }
981
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
982
+ const detail = result.stderr.trim() || result.stdout.trim();
983
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
984
+ throw new Error(message);
985
+ }
986
+ return {
987
+ text: result.stdout,
988
+ raw: {
989
+ command: renderedCommand,
990
+ stderr: result.stderr,
991
+ exitCode: result.exitCode ?? 0,
992
+ cwd: this.config.cwd
993
+ }
994
+ };
995
+ }
996
+ async ensureHealthy(signal) {
997
+ if (!this.config.healthcheck) {
998
+ return;
999
+ }
1000
+ if (!this.healthcheckPromise) {
1001
+ this.healthcheckPromise = this.runHealthcheck(this.config.healthcheck, signal);
1002
+ }
1003
+ return this.healthcheckPromise;
1004
+ }
1005
+ async runHealthcheck(healthcheck, signal) {
1006
+ if (!healthcheck) {
1007
+ return;
1008
+ }
1009
+ const timeoutMs = healthcheck.timeoutMs ?? this.config.timeoutMs;
1010
+ if (healthcheck.type === "http") {
1011
+ const controller = new AbortController();
1012
+ const timer = timeoutMs ? setTimeout(() => controller.abort(), timeoutMs) : void 0;
1013
+ signal?.addEventListener("abort", () => controller.abort(), { once: true });
1014
+ try {
1015
+ const response = await fetch(healthcheck.url, { method: "GET", signal: controller.signal });
1016
+ if (!response.ok) {
1017
+ throw new Error(`HTTP ${response.status} ${response.statusText}`);
1018
+ }
1019
+ } catch (error) {
1020
+ const reason = error instanceof Error ? error.message : String(error);
1021
+ throw new Error(`CLI healthcheck failed for '${this.targetName}': ${reason}`);
1022
+ } finally {
1023
+ if (timer !== void 0) {
1024
+ clearTimeout(timer);
1025
+ }
1026
+ }
1027
+ return;
1028
+ }
1029
+ const renderedCommand = renderTemplate(
1030
+ healthcheck.commandTemplate,
1031
+ buildTemplateValues(
1032
+ {
1033
+ prompt: "",
1034
+ guidelines: "",
1035
+ inputFiles: [],
1036
+ evalCaseId: "",
1037
+ attempt: 0
1038
+ },
1039
+ this.config
1040
+ )
1041
+ );
1042
+ const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
1043
+ const result = await this.runCommand(renderedCommand, {
1044
+ cwd: healthcheck.cwd ?? this.config.cwd,
1045
+ env,
1046
+ timeoutMs,
1047
+ signal
1048
+ });
1049
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
1050
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
1051
+ const detail = result.stderr.trim() || result.stdout.trim();
1052
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI healthcheck command exited with code ${codeText}`;
1053
+ throw new Error(`CLI healthcheck failed for '${this.targetName}': ${message}`);
1054
+ }
1055
+ }
1056
+ };
1057
+ function buildTemplateValues(request, config) {
1058
+ const inputFiles = normalizeInputFiles(request.inputFiles);
1059
+ return {
1060
+ PROMPT: shellEscape(request.prompt ?? ""),
1061
+ GUIDELINES: shellEscape(request.guidelines ?? ""),
1062
+ EVAL_ID: shellEscape(request.evalCaseId ?? ""),
1063
+ ATTEMPT: shellEscape(String(request.attempt ?? 0)),
1064
+ FILES: formatFileList(inputFiles, config.filesFormat)
1065
+ };
1066
+ }
1067
+ function normalizeInputFiles(inputFiles) {
1068
+ if (!inputFiles || inputFiles.length === 0) {
1069
+ return void 0;
1070
+ }
1071
+ const unique = /* @__PURE__ */ new Map();
1072
+ for (const inputFile of inputFiles) {
1073
+ const absolutePath = import_node_path3.default.resolve(inputFile);
1074
+ if (!unique.has(absolutePath)) {
1075
+ unique.set(absolutePath, absolutePath);
1076
+ }
1077
+ }
1078
+ return Array.from(unique.values());
1079
+ }
1080
+ function formatFileList(files, template) {
1081
+ if (!files || files.length === 0) {
1082
+ return "";
1083
+ }
1084
+ const formatter = template ?? "{path}";
1085
+ return files.map((filePath) => {
1086
+ const escapedPath = shellEscape(filePath);
1087
+ const escapedName = shellEscape(import_node_path3.default.basename(filePath));
1088
+ return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
1089
+ }).join(" ");
1090
+ }
1091
+ function renderTemplate(template, values) {
1092
+ return template.replace(/\{([A-Z_]+)\}/g, (match, key) => {
1093
+ const replacement = values[key];
1094
+ return replacement !== void 0 ? replacement : match;
1095
+ });
1096
+ }
1097
+ function shellEscape(value) {
1098
+ if (value.length === 0) {
1099
+ return "''";
1100
+ }
1101
+ if (process.platform === "win32") {
1102
+ const escaped = value.replace(/"/g, '\\"');
1103
+ return `"${escaped}"`;
1104
+ }
1105
+ return `'${value.replace(/'/g, `'"'"'`)}'`;
1106
+ }
1107
+ function formatTimeoutSuffix(timeoutMs) {
1108
+ if (!timeoutMs || timeoutMs <= 0) {
1109
+ return "";
1110
+ }
1111
+ const seconds = Math.ceil(timeoutMs / 1e3);
1112
+ return ` after ${seconds}s`;
1113
+ }
1114
+
838
1115
  // src/evaluation/providers/mock.ts
839
1116
  var DEFAULT_MOCK_RESPONSE = '{"answer":"Mock provider response. Configure targets.yaml to supply a custom value."}';
840
1117
  var MockProvider = class {
@@ -878,6 +1155,7 @@ var MockProvider = class {
878
1155
 
879
1156
  // src/evaluation/providers/targets.ts
880
1157
  var import_zod = require("zod");
1158
+ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES"]);
881
1159
  var BASE_TARGET_SCHEMA = import_zod.z.object({
882
1160
  name: import_zod.z.string().min(1, "target name is required"),
883
1161
  provider: import_zod.z.string().min(1, "provider is required"),
@@ -900,6 +1178,9 @@ function normalizeAzureApiVersion(value) {
900
1178
  function resolveTargetDefinition(definition, env = process.env) {
901
1179
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
902
1180
  const provider = parsed.provider.toLowerCase();
1181
+ const providerBatching = resolveOptionalBoolean(
1182
+ parsed.settings?.provider_batching ?? parsed.settings?.providerBatching
1183
+ );
903
1184
  switch (provider) {
904
1185
  case "azure":
905
1186
  case "azure-openai":
@@ -908,6 +1189,7 @@ function resolveTargetDefinition(definition, env = process.env) {
908
1189
  name: parsed.name,
909
1190
  judgeTarget: parsed.judge_target,
910
1191
  workers: parsed.workers,
1192
+ providerBatching,
911
1193
  config: resolveAzureConfig(parsed, env)
912
1194
  };
913
1195
  case "anthropic":
@@ -916,6 +1198,7 @@ function resolveTargetDefinition(definition, env = process.env) {
916
1198
  name: parsed.name,
917
1199
  judgeTarget: parsed.judge_target,
918
1200
  workers: parsed.workers,
1201
+ providerBatching,
919
1202
  config: resolveAnthropicConfig(parsed, env)
920
1203
  };
921
1204
  case "gemini":
@@ -926,14 +1209,26 @@ function resolveTargetDefinition(definition, env = process.env) {
926
1209
  name: parsed.name,
927
1210
  judgeTarget: parsed.judge_target,
928
1211
  workers: parsed.workers,
1212
+ providerBatching,
929
1213
  config: resolveGeminiConfig(parsed, env)
930
1214
  };
1215
+ case "codex":
1216
+ case "codex-cli":
1217
+ return {
1218
+ kind: "codex",
1219
+ name: parsed.name,
1220
+ judgeTarget: parsed.judge_target,
1221
+ workers: parsed.workers,
1222
+ providerBatching,
1223
+ config: resolveCodexConfig(parsed, env)
1224
+ };
931
1225
  case "mock":
932
1226
  return {
933
1227
  kind: "mock",
934
1228
  name: parsed.name,
935
1229
  judgeTarget: parsed.judge_target,
936
1230
  workers: parsed.workers,
1231
+ providerBatching,
937
1232
  config: resolveMockConfig(parsed)
938
1233
  };
939
1234
  case "vscode":
@@ -943,8 +1238,18 @@ function resolveTargetDefinition(definition, env = process.env) {
943
1238
  name: parsed.name,
944
1239
  judgeTarget: parsed.judge_target,
945
1240
  workers: parsed.workers,
1241
+ providerBatching,
946
1242
  config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
947
1243
  };
1244
+ case "cli":
1245
+ return {
1246
+ kind: "cli",
1247
+ name: parsed.name,
1248
+ judgeTarget: parsed.judge_target,
1249
+ workers: parsed.workers,
1250
+ providerBatching,
1251
+ config: resolveCliConfig(parsed, env)
1252
+ };
948
1253
  default:
949
1254
  throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
950
1255
  }
@@ -1012,6 +1317,29 @@ function resolveGeminiConfig(target, env) {
1012
1317
  maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`)
1013
1318
  };
1014
1319
  }
1320
+ function resolveCodexConfig(target, env) {
1321
+ const settings = target.settings ?? {};
1322
+ const executableSource = settings.executable ?? settings.command ?? settings.binary;
1323
+ const argsSource = settings.args ?? settings.arguments;
1324
+ const cwdSource = settings.cwd;
1325
+ const timeoutSource = settings.timeout_seconds ?? settings.timeoutSeconds;
1326
+ const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
1327
+ allowLiteral: true,
1328
+ optionalEnv: true
1329
+ }) ?? "codex";
1330
+ const args = resolveOptionalStringArray(argsSource, env, `${target.name} codex args`);
1331
+ const cwd = resolveOptionalString(cwdSource, env, `${target.name} codex cwd`, {
1332
+ allowLiteral: true,
1333
+ optionalEnv: true
1334
+ });
1335
+ const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} codex timeout`);
1336
+ return {
1337
+ executable,
1338
+ args,
1339
+ cwd,
1340
+ timeoutMs
1341
+ };
1342
+ }
1015
1343
  function resolveMockConfig(target) {
1016
1344
  const settings = target.settings ?? {};
1017
1345
  const response = typeof settings.response === "string" ? settings.response : void 0;
@@ -1041,6 +1369,125 @@ function resolveVSCodeConfig(target, env, insiders) {
1041
1369
  workspaceTemplate
1042
1370
  };
1043
1371
  }
1372
+ function resolveCliConfig(target, env) {
1373
+ const settings = target.settings ?? {};
1374
+ const commandTemplateSource = settings.command_template ?? settings.commandTemplate;
1375
+ const filesFormat = resolveOptionalLiteralString(
1376
+ settings.files_format ?? settings.filesFormat ?? settings.attachments_format ?? settings.attachmentsFormat
1377
+ );
1378
+ const cwd = resolveOptionalString(settings.cwd, env, `${target.name} working directory`, {
1379
+ allowLiteral: true,
1380
+ optionalEnv: true
1381
+ });
1382
+ const envOverrides = resolveEnvOverrides(settings.env, env, target.name);
1383
+ const timeoutMs = resolveTimeoutMs(settings.timeout_seconds ?? settings.timeoutSeconds, `${target.name} timeout`);
1384
+ const healthcheck = resolveCliHealthcheck(settings.healthcheck, env, target.name);
1385
+ const commandTemplate = resolveString(
1386
+ commandTemplateSource,
1387
+ env,
1388
+ `${target.name} CLI command template`,
1389
+ true
1390
+ );
1391
+ assertSupportedCliPlaceholders(commandTemplate, `${target.name} CLI command template`);
1392
+ return {
1393
+ commandTemplate,
1394
+ filesFormat,
1395
+ cwd,
1396
+ env: envOverrides,
1397
+ timeoutMs,
1398
+ healthcheck
1399
+ };
1400
+ }
1401
+ function resolveEnvOverrides(source, env, targetName) {
1402
+ if (source === void 0 || source === null) {
1403
+ return void 0;
1404
+ }
1405
+ if (typeof source !== "object" || Array.isArray(source)) {
1406
+ throw new Error(`${targetName} env overrides must be an object map of strings`);
1407
+ }
1408
+ const entries = Object.entries(source);
1409
+ const resolved = {};
1410
+ for (const [key, value] of entries) {
1411
+ if (typeof value !== "string") {
1412
+ throw new Error(`${targetName} env override '${key}' must be a string`);
1413
+ }
1414
+ const resolvedValue = resolveString(value, env, `${targetName} env override '${key}'`);
1415
+ resolved[key] = resolvedValue;
1416
+ }
1417
+ return Object.keys(resolved).length > 0 ? resolved : void 0;
1418
+ }
1419
+ function resolveTimeoutMs(source, description) {
1420
+ const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
1421
+ if (seconds === void 0) {
1422
+ return void 0;
1423
+ }
1424
+ if (seconds <= 0) {
1425
+ throw new Error(`${description} must be greater than zero seconds`);
1426
+ }
1427
+ return Math.floor(seconds * 1e3);
1428
+ }
1429
+ function resolveCliHealthcheck(source, env, targetName) {
1430
+ if (source === void 0 || source === null) {
1431
+ return void 0;
1432
+ }
1433
+ if (typeof source !== "object" || Array.isArray(source)) {
1434
+ throw new Error(`${targetName} healthcheck must be an object`);
1435
+ }
1436
+ const candidate = source;
1437
+ const type = candidate.type;
1438
+ const timeoutMs = resolveTimeoutMs(
1439
+ candidate.timeout_seconds ?? candidate.timeoutSeconds,
1440
+ `${targetName} healthcheck timeout`
1441
+ );
1442
+ if (type === "http") {
1443
+ const url = resolveString(candidate.url, env, `${targetName} healthcheck URL`);
1444
+ return {
1445
+ type: "http",
1446
+ url,
1447
+ timeoutMs
1448
+ };
1449
+ }
1450
+ if (type === "command") {
1451
+ const commandTemplate = resolveString(
1452
+ candidate.command_template ?? candidate.commandTemplate,
1453
+ env,
1454
+ `${targetName} healthcheck command template`,
1455
+ true
1456
+ );
1457
+ assertSupportedCliPlaceholders(commandTemplate, `${targetName} healthcheck command template`);
1458
+ const cwd = resolveOptionalString(candidate.cwd, env, `${targetName} healthcheck cwd`, {
1459
+ allowLiteral: true,
1460
+ optionalEnv: true
1461
+ });
1462
+ return {
1463
+ type: "command",
1464
+ commandTemplate,
1465
+ timeoutMs,
1466
+ cwd
1467
+ };
1468
+ }
1469
+ throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
1470
+ }
1471
+ function assertSupportedCliPlaceholders(template, description) {
1472
+ const placeholders = extractCliPlaceholders(template);
1473
+ for (const placeholder of placeholders) {
1474
+ if (!CLI_PLACEHOLDERS.has(placeholder)) {
1475
+ throw new Error(
1476
+ `${description} includes unsupported placeholder '{${placeholder}}'. Supported placeholders: ${Array.from(CLI_PLACEHOLDERS).join(", ")}`
1477
+ );
1478
+ }
1479
+ }
1480
+ }
1481
+ function extractCliPlaceholders(template) {
1482
+ const matches = template.matchAll(/\{([A-Z_]+)\}/g);
1483
+ const results = [];
1484
+ for (const match of matches) {
1485
+ if (match[1]) {
1486
+ results.push(match[1]);
1487
+ }
1488
+ }
1489
+ return results;
1490
+ }
1044
1491
  function resolveString(source, env, description, allowLiteral = false) {
1045
1492
  const value = resolveOptionalString(source, env, description, {
1046
1493
  allowLiteral,
@@ -1071,11 +1518,14 @@ function resolveOptionalString(source, env, description, options) {
1071
1518
  }
1072
1519
  const allowLiteral = options?.allowLiteral ?? false;
1073
1520
  const optionalEnv = options?.optionalEnv ?? false;
1074
- if (!allowLiteral && isLikelyEnvReference(trimmed)) {
1521
+ const looksLikeEnv = isLikelyEnvReference(trimmed);
1522
+ if (looksLikeEnv) {
1075
1523
  if (optionalEnv) {
1076
1524
  return void 0;
1077
1525
  }
1078
- throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
1526
+ if (!allowLiteral) {
1527
+ throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
1528
+ }
1079
1529
  }
1080
1530
  return trimmed;
1081
1531
  }
@@ -1125,15 +1575,48 @@ function resolveOptionalBoolean(source) {
1125
1575
  function isLikelyEnvReference(value) {
1126
1576
  return /^[A-Z0-9_]+$/.test(value);
1127
1577
  }
1578
+ function resolveOptionalStringArray(source, env, description) {
1579
+ if (source === void 0 || source === null) {
1580
+ return void 0;
1581
+ }
1582
+ if (!Array.isArray(source)) {
1583
+ throw new Error(`${description} must be an array of strings`);
1584
+ }
1585
+ if (source.length === 0) {
1586
+ return void 0;
1587
+ }
1588
+ const resolved = [];
1589
+ for (let i = 0; i < source.length; i++) {
1590
+ const item = source[i];
1591
+ if (typeof item !== "string") {
1592
+ throw new Error(`${description}[${i}] must be a string`);
1593
+ }
1594
+ const trimmed = item.trim();
1595
+ if (trimmed.length === 0) {
1596
+ throw new Error(`${description}[${i}] cannot be empty`);
1597
+ }
1598
+ const envValue = env[trimmed];
1599
+ if (envValue !== void 0) {
1600
+ if (envValue.trim().length === 0) {
1601
+ throw new Error(`Environment variable '${trimmed}' for ${description}[${i}] is empty`);
1602
+ }
1603
+ resolved.push(envValue);
1604
+ } else {
1605
+ resolved.push(trimmed);
1606
+ }
1607
+ }
1608
+ return resolved.length > 0 ? resolved : void 0;
1609
+ }
1128
1610
 
1129
1611
  // src/evaluation/providers/vscode.ts
1130
1612
  var import_promises3 = require("fs/promises");
1131
- var import_node_path3 = __toESM(require("path"), 1);
1613
+ var import_node_path4 = __toESM(require("path"), 1);
1132
1614
  var import_subagent = require("subagent");
1133
1615
  var VSCodeProvider = class {
1134
1616
  id;
1135
1617
  kind;
1136
1618
  targetName;
1619
+ supportsBatch = true;
1137
1620
  config;
1138
1621
  constructor(targetName, config, kind) {
1139
1622
  this.id = `${kind}:${targetName}`;
@@ -1145,12 +1628,11 @@ var VSCodeProvider = class {
1145
1628
  if (request.signal?.aborted) {
1146
1629
  throw new Error("VS Code provider request was aborted before dispatch");
1147
1630
  }
1148
- const attachments = normalizeAttachments(request.attachments);
1149
- const promptContent = buildPromptDocument(request, attachments, request.guideline_patterns);
1631
+ const inputFiles = normalizeAttachments(request.inputFiles);
1632
+ const promptContent = buildPromptDocument(request, inputFiles, request.guideline_patterns);
1150
1633
  const session = await (0, import_subagent.dispatchAgentSession)({
1151
1634
  userQuery: promptContent,
1152
- // Use full prompt content instead of just request.prompt
1153
- extraAttachments: attachments,
1635
+ extraAttachments: inputFiles,
1154
1636
  wait: this.config.waitForResponse,
1155
1637
  dryRun: this.config.dryRun,
1156
1638
  vscodeCmd: this.config.command,
@@ -1167,7 +1649,7 @@ var VSCodeProvider = class {
1167
1649
  text: "",
1168
1650
  raw: {
1169
1651
  session,
1170
- attachments
1652
+ inputFiles
1171
1653
  }
1172
1654
  };
1173
1655
  }
@@ -1176,42 +1658,106 @@ var VSCodeProvider = class {
1176
1658
  text: responseText,
1177
1659
  raw: {
1178
1660
  session,
1179
- attachments
1661
+ inputFiles
1180
1662
  }
1181
1663
  };
1182
1664
  }
1665
+ async invokeBatch(requests) {
1666
+ if (requests.length === 0) {
1667
+ return [];
1668
+ }
1669
+ const normalizedRequests = requests.map((req) => ({
1670
+ request: req,
1671
+ inputFiles: normalizeAttachments(req.inputFiles)
1672
+ }));
1673
+ const combinedInputFiles = mergeAttachments(
1674
+ normalizedRequests.map(({ inputFiles }) => inputFiles)
1675
+ );
1676
+ const userQueries = normalizedRequests.map(
1677
+ ({ request, inputFiles }) => buildPromptDocument(request, inputFiles, request.guideline_patterns)
1678
+ );
1679
+ const session = await (0, import_subagent.dispatchBatchAgent)({
1680
+ userQueries,
1681
+ extraAttachments: combinedInputFiles,
1682
+ wait: this.config.waitForResponse,
1683
+ dryRun: this.config.dryRun,
1684
+ vscodeCmd: this.config.command,
1685
+ subagentRoot: this.config.subagentRoot,
1686
+ workspaceTemplate: this.config.workspaceTemplate,
1687
+ silent: true
1688
+ });
1689
+ if (session.exitCode !== 0 || !session.responseFiles) {
1690
+ const failure = session.error ?? "VS Code subagent did not produce batch responses";
1691
+ throw new Error(failure);
1692
+ }
1693
+ if (this.config.dryRun) {
1694
+ return normalizedRequests.map(({ inputFiles }) => ({
1695
+ text: "",
1696
+ raw: {
1697
+ session,
1698
+ inputFiles,
1699
+ allInputFiles: combinedInputFiles
1700
+ }
1701
+ }));
1702
+ }
1703
+ if (session.responseFiles.length !== requests.length) {
1704
+ throw new Error(
1705
+ `VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
1706
+ );
1707
+ }
1708
+ const responses = [];
1709
+ for (const [index, responseFile] of session.responseFiles.entries()) {
1710
+ const responseText = await (0, import_promises3.readFile)(responseFile, "utf8");
1711
+ responses.push({
1712
+ text: responseText,
1713
+ raw: {
1714
+ session,
1715
+ inputFiles: normalizedRequests[index]?.inputFiles,
1716
+ allInputFiles: combinedInputFiles,
1717
+ responseFile
1718
+ }
1719
+ });
1720
+ }
1721
+ return responses;
1722
+ }
1183
1723
  };
1184
1724
  function buildPromptDocument(request, attachments, guidelinePatterns) {
1185
1725
  const parts = [];
1186
1726
  const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
1187
- if (guidelineFiles.length > 0) {
1188
- parts.push("\n", buildMandatoryPrereadBlock(guidelineFiles));
1727
+ const attachmentFiles = collectAttachmentFiles(attachments);
1728
+ const nonGuidelineAttachments = attachmentFiles.filter(
1729
+ (file) => !guidelineFiles.includes(file)
1730
+ );
1731
+ const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineAttachments);
1732
+ if (prereadBlock.length > 0) {
1733
+ parts.push("\n", prereadBlock);
1189
1734
  }
1190
1735
  parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
1191
1736
  return parts.join("\n").trim();
1192
1737
  }
1193
- function buildMandatoryPrereadBlock(guidelineFiles) {
1194
- if (guidelineFiles.length === 0) {
1738
+ function buildMandatoryPrereadBlock(guidelineFiles, attachmentFiles) {
1739
+ if (guidelineFiles.length === 0 && attachmentFiles.length === 0) {
1195
1740
  return "";
1196
1741
  }
1197
- const fileList = [];
1198
- let counter = 0;
1199
- for (const absolutePath of guidelineFiles) {
1200
- counter += 1;
1201
- const fileName = import_node_path3.default.basename(absolutePath);
1742
+ const buildList = (files) => files.map((absolutePath) => {
1743
+ const fileName = import_node_path4.default.basename(absolutePath);
1202
1744
  const fileUri = pathToFileUri(absolutePath);
1203
- fileList.push(`* [${fileName}](${fileUri})`);
1204
- }
1205
- const filesText = fileList.join("\n");
1206
- const instruction = [
1207
- `Read all guideline files:
1208
- ${filesText}.
1209
- `,
1210
- `If any file is missing, fail with ERROR: missing-file <filename> and stop.
1211
- `,
1212
- `Then apply system_instructions on the user query below.`
1213
- ].join("");
1214
- return `${instruction}`;
1745
+ return `* [${fileName}](${fileUri})`;
1746
+ });
1747
+ const sections = [];
1748
+ if (guidelineFiles.length > 0) {
1749
+ sections.push(`Read all guideline files:
1750
+ ${buildList(guidelineFiles).join("\n")}.`);
1751
+ }
1752
+ if (attachmentFiles.length > 0) {
1753
+ sections.push(`Read all attachment files:
1754
+ ${buildList(attachmentFiles).join("\n")}.`);
1755
+ }
1756
+ sections.push(
1757
+ "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
1758
+ "Then apply system_instructions on the user query below."
1759
+ );
1760
+ return sections.join("\n");
1215
1761
  }
1216
1762
  function collectGuidelineFiles(attachments, guidelinePatterns) {
1217
1763
  if (!attachments || attachments.length === 0) {
@@ -1219,8 +1765,8 @@ function collectGuidelineFiles(attachments, guidelinePatterns) {
1219
1765
  }
1220
1766
  const unique = /* @__PURE__ */ new Map();
1221
1767
  for (const attachment of attachments) {
1222
- const absolutePath = import_node_path3.default.resolve(attachment);
1223
- const normalized = absolutePath.split(import_node_path3.default.sep).join("/");
1768
+ const absolutePath = import_node_path4.default.resolve(attachment);
1769
+ const normalized = absolutePath.split(import_node_path4.default.sep).join("/");
1224
1770
  if (isGuidelineFile(normalized, guidelinePatterns)) {
1225
1771
  if (!unique.has(absolutePath)) {
1226
1772
  unique.set(absolutePath, absolutePath);
@@ -1229,8 +1775,21 @@ function collectGuidelineFiles(attachments, guidelinePatterns) {
1229
1775
  }
1230
1776
  return Array.from(unique.values());
1231
1777
  }
1778
/**
 * Resolves every attachment to an absolute path and removes duplicates,
 * keeping the position at which each distinct path first appears.
 *
 * @param {readonly string[] | undefined} attachments Attachment paths.
 * @returns {string[]} Unique absolute paths; empty when nothing was supplied.
 */
function collectAttachmentFiles(attachments) {
  const nothingSupplied = !attachments || attachments.length === 0;
  if (nothingSupplied) {
    return [];
  }
  const resolvedPaths = Array.from(
    attachments,
    (attachment) => import_node_path4.default.resolve(attachment)
  );
  // A Set keeps first-insertion order, which is the order callers see.
  return [...new Set(resolvedPaths)];
}
1232
1791
  function pathToFileUri(filePath) {
1233
- const absolutePath = import_node_path3.default.isAbsolute(filePath) ? filePath : import_node_path3.default.resolve(filePath);
1792
+ const absolutePath = import_node_path4.default.isAbsolute(filePath) ? filePath : import_node_path4.default.resolve(filePath);
1234
1793
  const normalizedPath = absolutePath.replace(/\\/g, "/");
1235
1794
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
1236
1795
  return `file:///${normalizedPath}`;
@@ -1243,10 +1802,20 @@ function normalizeAttachments(attachments) {
1243
1802
  }
1244
1803
  const deduped = /* @__PURE__ */ new Set();
1245
1804
  for (const attachment of attachments) {
1246
- deduped.add(import_node_path3.default.resolve(attachment));
1805
+ deduped.add(import_node_path4.default.resolve(attachment));
1247
1806
  }
1248
1807
  return Array.from(deduped);
1249
1808
  }
1809
/**
 * Flattens several attachment lists into one de-duplicated list of absolute
 * paths. Falsy entries in `all` are skipped.
 *
 * @param {Iterable<readonly string[] | undefined | null>} all Lists to merge.
 * @returns {string[] | undefined} Unique absolute paths in first-seen order,
 *   or `undefined` when no path was found at all.
 */
function mergeAttachments(all) {
  const resolvedPaths = new Set();
  for (const group of all) {
    for (const inputFile of group || []) {
      resolvedPaths.add(import_node_path4.default.resolve(inputFile));
    }
  }
  if (resolvedPaths.size === 0) {
    return void 0;
  }
  return [...resolvedPaths];
}
1250
1819
  async function ensureVSCodeSubagents(options) {
1251
1820
  const { kind, count, verbose = false } = options;
1252
1821
  const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
@@ -1274,22 +1843,598 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
1274
1843
  provisioned: true,
1275
1844
  message: `Provisioned ${count} subagent(s): ${result.created.length} created, ${result.skippedExisting.length} reused`
1276
1845
  };
1277
- } catch (error) {
1278
- const errorMessage = error instanceof Error ? error.message : String(error);
1279
- if (verbose) {
1280
- console.warn(`Provisioning failed (continuing anyway): ${errorMessage}`);
1846
+ } catch (error) {
1847
+ const errorMessage = error instanceof Error ? error.message : String(error);
1848
+ if (verbose) {
1849
+ console.warn(`Provisioning failed (continuing anyway): ${errorMessage}`);
1850
+ }
1851
+ return {
1852
+ provisioned: false,
1853
+ message: `Provisioning failed: ${errorMessage}`
1854
+ };
1855
+ }
1856
+ }
1857
+
1858
+ // src/evaluation/providers/codex.ts
1859
+ var import_node_child_process2 = require("child_process");
1860
+ var import_node_fs3 = require("fs");
1861
+ var import_promises4 = require("fs/promises");
1862
+ var import_node_os = require("os");
1863
+ var import_node_path6 = __toESM(require("path"), 1);
1864
+ var import_node_util2 = require("util");
1865
+
1866
+ // src/evaluation/providers/preread.ts
1867
+ var import_node_path5 = __toESM(require("path"), 1);
1868
/**
 * Assembles the prompt document: an optional pre-read block listing the
 * guideline and input files, followed by the `user_query` marker and the
 * trimmed request prompt.
 *
 * @param {{ prompt: string, guideline_patterns?: unknown }} request Provider request.
 * @param {readonly string[] | undefined} inputFiles Files the agent should read.
 * @param {{ guidelinePatterns?: unknown, guidelineOverrides?: { has(path: string): boolean } }} [options]
 *   `guidelinePatterns` takes precedence over `request.guideline_patterns`;
 *   `guidelineOverrides` is forwarded to the guideline collector.
 * @returns {string} The trimmed prompt document.
 */
function buildPromptDocument2(request, inputFiles, options) {
  const patterns = options?.guidelinePatterns ?? request.guideline_patterns;
  const guidelineFiles = collectGuidelineFiles2(inputFiles, patterns, options?.guidelineOverrides);
  const guidelineLookup = new Set(guidelineFiles);
  // Files already listed as guidelines are not repeated in the input-file section.
  const nonGuidelineInputFiles = collectInputFiles(inputFiles).filter(
    (file) => !guidelineLookup.has(file)
  );
  const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineInputFiles);
  const segments = prereadBlock.length > 0 ? ["\n", prereadBlock] : [];
  segments.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
  return segments.join("\n").trim();
}
1886
/**
 * Converts the request's input files into unique absolute paths.
 *
 * @param {readonly string[] | undefined} inputFiles Paths as given by the caller.
 * @returns {string[] | undefined} Unique absolute paths in first-seen order,
 *   or `undefined` when the input is missing or empty.
 */
function normalizeInputFiles2(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return void 0;
  }
  const uniquePaths = new Set();
  for (const inputFile of inputFiles) {
    uniquePaths.add(import_node_path5.default.resolve(inputFile));
  }
  return Array.from(uniquePaths);
}
1899
/**
 * Selects the input files that count as guideline files.
 *
 * A file is selected when its absolute path is in `overrides`, or when
 * `isGuidelineFile` accepts its forward-slash form for the given patterns.
 * The pattern check is skipped for overridden paths.
 *
 * @param {readonly string[] | undefined} inputFiles Candidate files.
 * @param {unknown} guidelinePatterns Patterns handed to `isGuidelineFile`.
 * @param {{ has(path: string): boolean } | undefined} overrides Absolute paths
 *   that are always treated as guidelines.
 * @returns {string[]} Unique absolute guideline paths in first-seen order.
 */
function collectGuidelineFiles2(inputFiles, guidelinePatterns, overrides) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  const pathModule = import_node_path5.default;
  const selected = new Set();
  for (const inputFile of inputFiles) {
    const absolutePath = pathModule.resolve(inputFile);
    const isGuideline = overrides?.has(absolutePath) || isGuidelineFile(absolutePath.split(pathModule.sep).join("/"), guidelinePatterns);
    if (isGuideline) {
      selected.add(absolutePath);
    }
  }
  return [...selected];
}
1921
/**
 * Resolves input files to absolute paths and removes duplicates while
 * preserving first-seen order.
 *
 * @param {readonly string[] | undefined} inputFiles Input file paths.
 * @returns {string[]} Unique absolute paths; empty when nothing was supplied.
 */
function collectInputFiles(inputFiles) {
  const hasFiles = Boolean(inputFiles) && inputFiles.length !== 0;
  if (!hasFiles) {
    return [];
  }
  const resolvedPaths = Array.from(
    inputFiles,
    (inputFile) => import_node_path5.default.resolve(inputFile)
  );
  return Array.from(new Set(resolvedPaths));
}
1934
/**
 * Renders the "mandatory pre-read" instructions for the prompt document.
 *
 * Guideline files and other input files are listed in separate headed
 * sections as markdown bullets (`* [basename](file-uri)`), and the fixed
 * missing-file and apply-instructions lines are appended afterwards.
 *
 * @param {readonly string[]} guidelineFiles Absolute guideline paths.
 * @param {readonly string[]} inputFiles Absolute paths of the remaining inputs.
 * @returns {string} The block text, or `""` when both lists are empty.
 */
function buildMandatoryPrereadBlock2(guidelineFiles, inputFiles) {
  const hasGuidelines = guidelineFiles.length > 0;
  const hasInputs = inputFiles.length > 0;
  if (!hasGuidelines && !hasInputs) {
    return "";
  }
  const toBullet = (absolutePath) => {
    const fileName = import_node_path5.default.basename(absolutePath);
    const fileUri = pathToFileUri2(absolutePath);
    return `* [${fileName}](${fileUri})`;
  };
  const renderSection = (heading, files) => `${heading}\n${files.map((file) => toBullet(file)).join("\n")}.`;
  const lines = [];
  if (hasGuidelines) {
    lines.push(renderSection("Read all guideline files:", guidelineFiles));
  }
  if (hasInputs) {
    lines.push(renderSection("Read all input files:", inputFiles));
  }
  lines.push("If any file is missing, fail with ERROR: missing-file <filename> and stop.");
  lines.push("Then apply system_instructions on the user query below.");
  return lines.join("\n");
}
1958
/**
 * Turns a filesystem path into a `file://` URI string.
 *
 * Relative paths are resolved first and backslashes become forward slashes.
 * Paths that start with a drive letter (`C:/...`) get a third slash after the
 * scheme. No percent-encoding is applied.
 *
 * @param {string} filePath Absolute or relative path.
 * @returns {string} The URI string.
 */
function pathToFileUri2(filePath) {
  const pathModule = import_node_path5.default;
  let absolutePath = filePath;
  if (!pathModule.isAbsolute(filePath)) {
    absolutePath = pathModule.resolve(filePath);
  }
  const normalizedPath = absolutePath.replace(/\\/g, "/");
  const startsWithDriveLetter = /^[a-zA-Z]:\//.test(normalizedPath);
  const prefix = startsWithDriveLetter ? "file:///" : "file://";
  return `${prefix}${normalizedPath}`;
}
1966
+
1967
+ // src/evaluation/providers/codex.ts
1968
+ var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
1969
+ var WORKSPACE_PREFIX = "agentv-codex-";
1970
+ var PROMPT_FILENAME = "prompt.md";
1971
+ var FILES_DIR = "files";
1972
+ var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
1973
/**
 * Provider that answers a request by running the Codex CLI once.
 *
 * For every `invoke` call it creates a temporary workspace, copies the input
 * files into it, writes the prompt document to `prompt.md`, runs the CLI with
 * the prompt on stdin, extracts the assistant text from the JSON output, and
 * finally removes the workspace again.
 */
var CodexProvider = class {
  id;
  kind = "codex";
  targetName;
  supportsBatch = false;
  config;
  runCodex;
  // Memoised promise of the one-time executable lookup.
  environmentCheck;
  resolvedExecutable;
  /**
   * @param {string} targetName Target name; also used to build `id`.
   * @param {object} config Target settings (`executable`, `args`, `cwd`, `timeoutMs`).
   * @param {Function} [runner=defaultCodexRunner] Function that actually runs the CLI;
   *   replaceable so the process spawn can be substituted.
   */
  constructor(targetName, config, runner = defaultCodexRunner) {
    this.id = `codex:${targetName}`;
    this.targetName = targetName;
    this.config = config;
    this.runCodex = runner;
  }
  /**
   * Runs one request through the Codex CLI.
   *
   * @param {object} request Provider request (`prompt`, `inputFiles`, `guideline_patterns`, `signal`).
   * @returns {Promise<{ text: string, raw: object }>} Assistant text plus raw run details.
   * @throws {Error} When the request is already aborted, the CLI times out,
   *   exits non-zero, or its output contains no assistant message.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Codex provider request was aborted before execution");
    }
    await this.ensureEnvironmentReady();
    const inputFiles = normalizeInputFiles2(request.inputFiles);
    // Remember which ORIGINAL paths are guidelines; the mirrored copies live
    // under different paths and are tracked separately via guidelineMirrors.
    const originalGuidelines = new Set(
      collectGuidelineFiles2(inputFiles, request.guideline_patterns).map((file) => import_node_path6.default.resolve(file))
    );
    const workspaceRoot = await this.createWorkspace();
    try {
      const { mirroredInputFiles, guidelineMirrors } = await this.mirrorInputFiles(
        inputFiles,
        workspaceRoot,
        originalGuidelines
      );
      const promptContent = buildPromptDocument2(request, mirroredInputFiles, {
        guidelinePatterns: request.guideline_patterns,
        guidelineOverrides: guidelineMirrors
      });
      const promptFile = import_node_path6.default.join(workspaceRoot, PROMPT_FILENAME);
      await (0, import_promises4.writeFile)(promptFile, promptContent, "utf8");
      const args = this.buildCodexArgs();
      const cwd = this.resolveCwd(workspaceRoot);
      const result = await this.executeCodex(args, cwd, promptContent, request.signal);
      if (result.timedOut) {
        throw new Error(
          `Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
        );
      }
      if (result.exitCode !== 0) {
        const detail = pickDetail(result.stderr, result.stdout);
        const prefix = `Codex CLI exited with code ${result.exitCode}`;
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parseCodexJson(result.stdout);
      const assistantText = extractAssistantText(parsed);
      return {
        text: assistantText,
        raw: {
          response: parsed,
          stdout: result.stdout,
          stderr: result.stderr,
          exitCode: result.exitCode,
          args,
          executable: this.resolvedExecutable ?? this.config.executable,
          promptFile,
          workspace: workspaceRoot,
          inputFiles: mirroredInputFiles
        }
      };
    } finally {
      await this.cleanupWorkspace(workspaceRoot);
    }
  }
  /** Starts the executable lookup on first use and awaits the same promise afterwards. */
  async ensureEnvironmentReady() {
    if (!this.environmentCheck) {
      this.environmentCheck = this.validateEnvironment();
    }
    await this.environmentCheck;
  }
  /** Resolves the configured executable and stores the result in `resolvedExecutable`. */
  async validateEnvironment() {
    this.resolvedExecutable = await locateExecutable(this.config.executable);
  }
  /**
   * @param {string} workspaceRoot Temporary workspace directory.
   * @returns {string} The configured `cwd` (resolved) or, without one, the workspace.
   */
  resolveCwd(workspaceRoot) {
    if (!this.config.cwd) {
      return workspaceRoot;
    }
    return import_node_path6.default.resolve(this.config.cwd);
  }
  /**
   * @returns {string[]} Fixed CLI flags, then the configured extra args, then
   *   a trailing `-` argument (the prompt itself is supplied on stdin).
   */
  buildCodexArgs() {
    const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
    if (this.config.args && this.config.args.length > 0) {
      args.push(...this.config.args);
    }
    args.push("-");
    return args;
  }
  /**
   * Invokes the runner and rewrites an ENOENT failure into a message that
   * names the configured executable. All other errors are rethrown as-is.
   */
  async executeCodex(args, cwd, promptContent, signal) {
    try {
      return await this.runCodex({
        executable: this.resolvedExecutable ?? this.config.executable,
        args,
        cwd,
        prompt: promptContent,
        timeoutMs: this.config.timeoutMs,
        env: process.env,
        signal
      });
    } catch (error) {
      // Optional chaining: a runner may reject with a nullish value.
      if (error?.code === "ENOENT") {
        throw new Error(
          `Codex executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
        );
      }
      throw error;
    }
  }
  /**
   * Copies the input files into `<workspaceRoot>/files`.
   *
   * @param {readonly string[] | undefined} inputFiles Files to copy.
   * @param {string} workspaceRoot Temporary workspace directory.
   * @param {{ has(path: string): boolean }} guidelineOriginals Absolute source
   *   paths that are guideline files.
   * @returns {Promise<{ mirroredInputFiles: string[] | undefined, guidelineMirrors: Set<string> }>}
   *   Destination paths in input order, and the subset whose source was a guideline.
   */
  async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
    if (!inputFiles || inputFiles.length === 0) {
      return {
        mirroredInputFiles: void 0,
        guidelineMirrors: /* @__PURE__ */ new Set()
      };
    }
    const filesRoot = import_node_path6.default.join(workspaceRoot, FILES_DIR);
    await (0, import_promises4.mkdir)(filesRoot, { recursive: true });
    const mirrored = [];
    const guidelineMirrors = /* @__PURE__ */ new Set();
    const usedNames = /* @__PURE__ */ new Set();
    for (const inputFile of inputFiles) {
      const absoluteSource = import_node_path6.default.resolve(inputFile);
      const baseName = import_node_path6.default.basename(absoluteSource);
      const finalName = this.allocateMirrorName(baseName, usedNames);
      const destination = import_node_path6.default.join(filesRoot, finalName);
      await (0, import_promises4.copyFile)(absoluteSource, destination);
      const resolvedDestination = import_node_path6.default.resolve(destination);
      mirrored.push(resolvedDestination);
      if (guidelineOriginals.has(absoluteSource)) {
        guidelineMirrors.add(resolvedDestination);
      }
    }
    return {
      mirroredInputFiles: mirrored,
      guidelineMirrors
    };
  }
  /**
   * Picks a destination file name that has not been handed out yet.
   *
   * The first use of a name keeps it unchanged. Later collisions get a
   * numeric suffix placed BEFORE the extension (`notes-1.md`), so the copy
   * keeps its file type and can never overwrite an earlier copy, including
   * an input that is itself literally named like a suffixed copy. Names are
   * compared case-insensitively so the result is also safe on
   * case-insensitive filesystems.
   *
   * @param {string} baseName Base name of the source file.
   * @param {Set<string>} usedNames Lower-cased names already allocated; updated in place.
   * @returns {string} The unique file name to use.
   */
  allocateMirrorName(baseName, usedNames) {
    const extension = import_node_path6.default.extname(baseName);
    const stem = extension.length > 0 ? baseName.slice(0, -extension.length) : baseName;
    let candidate = baseName;
    for (let suffix = 1; usedNames.has(candidate.toLowerCase()); suffix += 1) {
      candidate = `${stem}-${suffix}${extension}`;
    }
    usedNames.add(candidate.toLowerCase());
    return candidate;
  }
  /** Creates a fresh temporary directory under the OS temp dir. */
  async createWorkspace() {
    return await (0, import_promises4.mkdtemp)(import_node_path6.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
  }
  /** Removes the workspace; failures are ignored because cleanup is best-effort. */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await (0, import_promises4.rm)(workspaceRoot, { recursive: true, force: true });
    } catch {
      // Deliberately ignored: a leftover temp directory must not fail the run.
    }
  }
};
2128
/**
 * Finds the Codex executable on disk.
 *
 * A candidate that contains a path separator is treated as a path: it is
 * resolved and checked for existence. Otherwise the platform locator
 * (`where` on Windows, `which` elsewhere) is asked to find it on PATH.
 *
 * The locator is run with `execFile` and an argument vector rather than a
 * shell command string, so a configured name containing spaces or shell
 * metacharacters is passed through literally and cannot run extra commands.
 *
 * @param {string} candidate Executable name or path from the target settings.
 * @returns {Promise<string>} Path of the executable that was found.
 * @throws {Error} When a PATH lookup finds nothing usable; for a path
 *   candidate the rejection from the existence check propagates unchanged.
 */
async function locateExecutable(candidate) {
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
  if (includesPathSeparator) {
    const resolved = import_node_path6.default.isAbsolute(candidate) ? candidate : import_node_path6.default.resolve(candidate);
    const executablePath = await ensureWindowsExecutableVariant(resolved);
    await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
    return executablePath;
  }
  const locator = process.platform === "win32" ? "where" : "which";
  const execFileAsync = (0, import_node_util2.promisify)(import_node_child_process2.execFile);
  try {
    const { stdout } = await execFileAsync(locator, [candidate]);
    const lines = stdout.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
    const preferred = selectExecutableCandidate(lines);
    if (preferred) {
      const executablePath = await ensureWindowsExecutableVariant(preferred);
      await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
      return executablePath;
    }
  } catch {
    // Any lookup failure (locator missing, non-zero exit, stale hit) is
    // reported uniformly by the "not found" error below.
  }
  throw new Error(`Codex executable '${candidate}' was not found on PATH`);
}
2150
/**
 * Chooses one path from the locator's output.
 *
 * Outside Windows the first entry wins. On Windows the executable extensions
 * are tried in their listed order and the first candidate ending in the
 * current extension is returned; if none matches, the first entry is used.
 *
 * @param {readonly string[]} candidates Non-empty lines printed by the locator.
 * @returns {string | undefined} The chosen path, or `undefined` for an empty list.
 */
function selectExecutableCandidate(candidates) {
  if (candidates.length === 0) {
    return void 0;
  }
  const fallback = candidates[0];
  if (process.platform !== "win32") {
    return fallback;
  }
  const loweredCandidates = candidates.map((candidate) => candidate.toLowerCase());
  for (const ext of getWindowsExecutableExtensions()) {
    const matchIndex = loweredCandidates.findIndex((name) => name.endsWith(ext));
    if (matchIndex !== -1 && candidates[matchIndex]) {
      return candidates[matchIndex];
    }
  }
  return fallback;
}
2166
/**
 * On Windows, finds the on-disk variant of a path that was given without an
 * executable extension by probing `<candidate><ext>` for each known
 * extension. Elsewhere, or when the candidate already has such an
 * extension, the candidate is returned unchanged.
 *
 * @param {string} candidate Path to the executable.
 * @returns {Promise<string>} The first existing variant, else the candidate itself.
 */
async function ensureWindowsExecutableVariant(candidate) {
  const needsProbe = process.platform === "win32" && !hasExecutableExtension(candidate);
  if (!needsProbe) {
    return candidate;
  }
  const exists = (target) => Promise.resolve().then(() => (0, import_promises4.access)(target, import_node_fs3.constants.F_OK)).then(() => true, () => false);
  for (const ext of getWindowsExecutableExtensions()) {
    const withExtension = `${candidate}${ext}`;
    if (await exists(withExtension)) {
      return withExtension;
    }
  }
  return candidate;
}
2184
/**
 * Reports whether the path ends (case-insensitively) with one of the
 * extensions returned by `getWindowsExecutableExtensions`.
 *
 * @param {string} candidate Path or file name.
 * @returns {boolean} `true` when an executable extension is present.
 */
function hasExecutableExtension(candidate) {
  const lowered = candidate.toLowerCase();
  for (const ext of getWindowsExecutableExtensions()) {
    if (lowered.endsWith(ext)) {
      return true;
    }
  }
  return false;
}
2188
// Extensions used on Windows when PATHEXT is unset or yields no entries.
var DEFAULT_WINDOWS_EXTENSIONS = [".com", ".exe", ".bat", ".cmd", ".ps1"];
/**
 * Lists the file extensions treated as executable on Windows.
 *
 * Reads `PATHEXT` (semicolon separated), trimming and lower-casing each
 * entry and dropping empty ones; falls back to the default list when that
 * leaves nothing.
 *
 * @returns {string[]} Lower-case extensions, or an empty list on non-Windows platforms.
 */
function getWindowsExecutableExtensions() {
  if (process.platform !== "win32") {
    return [];
  }
  const pathExt = process.env.PATHEXT;
  if (pathExt == null) {
    return DEFAULT_WINDOWS_EXTENSIONS;
  }
  const fromEnv = [];
  for (const rawEntry of pathExt.split(";")) {
    const ext = rawEntry.trim().toLowerCase();
    if (ext.length > 0) {
      fromEnv.push(ext);
    }
  }
  return fromEnv.length > 0 ? fromEnv : DEFAULT_WINDOWS_EXTENSIONS;
}
2196
/**
 * Parses the CLI's stdout produced in `--json` mode.
 *
 * Attempts, in order: the whole output as one JSON document; one JSON
 * document per line (via `parseJsonLines`); the text from the last `{` to
 * the end. The first attempt that parses wins.
 *
 * @param {string} output Raw stdout.
 * @returns {unknown} The parsed value (an array when the output was line-delimited).
 * @throws {Error} When the output is blank or none of the attempts parse.
 */
function parseCodexJson(output) {
  const text = output.trim();
  if (text.length === 0) {
    throw new Error("Codex CLI produced no output in --json mode");
  }
  // Wraps JSON.parse so a legitimately falsy result (null, 0, "") is not
  // mistaken for a failure.
  const attempt = (source) => {
    try {
      return { ok: true, value: JSON.parse(source) };
    } catch {
      return { ok: false, value: void 0 };
    }
  };
  const whole = attempt(text);
  if (whole.ok) {
    return whole.value;
  }
  const lineObjects = parseJsonLines(text);
  if (lineObjects) {
    return lineObjects;
  }
  const lastBrace = text.lastIndexOf("{");
  if (lastBrace >= 0) {
    const tail = attempt(text.slice(lastBrace));
    if (tail.ok) {
      return tail.value;
    }
  }
  const preview = text.slice(0, 200);
  throw new Error(`Codex CLI emitted invalid JSON: ${preview}${text.length > 200 ? "\u2026" : ""}`);
}
2220
/**
 * Pulls the assistant's text out of parsed CLI output.
 *
 * Sources are tried in this order and the first non-empty text wins:
 *  1. an event array (via `extractFromEventStream`),
 *  2. the value treated as a single event (via `extractFromEvent`),
 *  3. the last `messages` entry with role `assistant` that has text,
 *  4. `response.content`,
 *  5. `output`.
 *
 * @param {unknown} parsed Value returned by the JSON parser.
 * @returns {string} The assistant text.
 * @throws {Error} When no source yields text.
 */
function extractAssistantText(parsed) {
  const missingMessage = "Codex CLI JSON response did not include an assistant message";
  if (Array.isArray(parsed)) {
    const streamed = extractFromEventStream(parsed);
    if (streamed) {
      return streamed;
    }
  }
  if (!parsed || typeof parsed !== "object") {
    throw new Error(missingMessage);
  }
  const direct = extractFromEvent(parsed);
  if (direct) {
    return direct;
  }
  if (Array.isArray(parsed.messages)) {
    // Walk newest-first so the latest assistant turn is preferred.
    for (const entry of [...parsed.messages].reverse()) {
      const isAssistantEntry = entry && typeof entry === "object" && entry.role === "assistant";
      if (!isAssistantEntry) {
        continue;
      }
      const messageText = flattenContent(entry.content);
      if (messageText) {
        return messageText;
      }
    }
  }
  const { response } = parsed;
  if (response && typeof response === "object") {
    const responseText = flattenContent(response.content);
    if (responseText) {
      return responseText;
    }
  }
  const outputText = flattenContent(parsed.output);
  if (outputText) {
    return outputText;
  }
  throw new Error(missingMessage);
}
2268
/**
 * Scans an event list from the end towards the start and returns the text
 * of the first event that yields any.
 *
 * @param {readonly unknown[]} events Parsed events.
 * @returns {string | undefined} The text found, or `undefined`.
 */
function extractFromEventStream(events) {
  let position = events.length;
  while (position > 0) {
    position -= 1;
    const text = extractFromEvent(events[position]);
    if (text) {
      return text;
    }
  }
  return void 0;
}
2278
/**
 * Extracts text from one event object.
 *
 * An event whose `type` equals the item-completed constant is inspected via
 * its `item` first; otherwise, or when the item has no text, the event's
 * `output` (or `content` when `output` is nullish) is flattened.
 *
 * @param {unknown} event Parsed event.
 * @returns {string | undefined} Non-empty text, or `undefined`.
 */
function extractFromEvent(event) {
  if (event == null || typeof event !== "object") {
    return void 0;
  }
  const eventType = typeof event.type === "string" ? event.type : void 0;
  if (eventType === JSONL_TYPE_ITEM_COMPLETED) {
    const itemText = extractFromItem(event.item);
    if (itemText) {
      return itemText;
    }
  }
  const payloadText = flattenContent(event.output ?? event.content);
  return payloadText ? payloadText : void 0;
}
2298
/**
 * Extracts text from a completed item whose `type` is `agent_message`,
 * `response` or `output`, reading the first non-nullish of `text`,
 * `content` and `output`.
 *
 * @param {unknown} item The event's `item` value.
 * @returns {string | undefined} Non-empty text, or `undefined` for other
 *   item types and for items without text.
 */
function extractFromItem(item) {
  if (item == null || typeof item !== "object") {
    return void 0;
  }
  const itemType = typeof item.type === "string" ? item.type : void 0;
  const carriesText = ["agent_message", "response", "output"].includes(itemType);
  if (!carriesText) {
    return void 0;
  }
  const text = flattenContent(item.text ?? item.content ?? item.output);
  return text ? text : void 0;
}
2312
/**
 * Reduces a content value to plain text.
 *
 * - a string is returned unchanged;
 * - an array contributes its string elements and the string `text` of its
 *   object elements, with empty pieces dropped and the rest joined by " \n";
 * - an object with a `text` property yields that property when it is a string.
 *
 * @param {unknown} value Content in any of the shapes above.
 * @returns {string | undefined} The text, or `undefined` when there is none.
 */
function flattenContent(value) {
  if (typeof value === "string") {
    return value;
  }
  const hasTextProperty = (candidate) => Boolean(candidate) && typeof candidate === "object" && "text" in candidate;
  const readText = (holder) => {
    const { text } = holder;
    return typeof text === "string" ? text : void 0;
  };
  if (Array.isArray(value)) {
    const pieces = [];
    for (const segment of value) {
      let piece;
      if (typeof segment === "string") {
        piece = segment;
      } else if (hasTextProperty(segment)) {
        piece = readText(segment);
      }
      if (typeof piece === "string" && piece.length > 0) {
        pieces.push(piece);
      }
    }
    return pieces.length > 0 ? pieces.join(" \n") : void 0;
  }
  return hasTextProperty(value) ? readText(value) : void 0;
}
2335
/**
 * Parses line-delimited JSON.
 *
 * Blank lines are ignored. The result is `undefined` unless there are at
 * least two non-blank lines and every one of them is valid JSON.
 *
 * @param {string} output Text with one JSON document per line.
 * @returns {unknown[] | undefined} Parsed values in line order, or `undefined`.
 */
function parseJsonLines(output) {
  const nonBlankLines = output.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
  if (nonBlankLines.length <= 1) {
    return void 0;
  }
  try {
    // Arrow wrapper: passing JSON.parse directly would feed the index in as a reviver.
    return nonBlankLines.map((line) => JSON.parse(line));
  } catch {
    return void 0;
  }
}
2350
/**
 * Chooses the text used as error detail: trimmed stderr when it has
 * content, otherwise trimmed stdout.
 *
 * @param {string} stderr Captured standard error.
 * @param {string} stdout Captured standard output.
 * @returns {string | undefined} The detail text, or `undefined` when both are blank.
 */
function pickDetail(stderr, stdout) {
  for (const stream of [stderr, stdout]) {
    const text = stream.trim();
    if (text.length > 0) {
      return text;
    }
  }
  return void 0;
}
2358
/**
 * Formats the " after Ns" suffix for timeout error messages, rounding the
 * timeout up to whole seconds.
 *
 * @param {number | undefined} timeoutMs Timeout in milliseconds.
 * @returns {string} The suffix, or `""` when the timeout is falsy or not positive.
 */
function formatTimeoutSuffix2(timeoutMs) {
  const hasTimeout = Boolean(timeoutMs) && !(timeoutMs <= 0);
  return hasTimeout ? ` after ${Math.ceil(timeoutMs / 1e3)}s` : "";
}
2365
/**
 * Default runner: spawns the Codex executable, writes the prompt to its
 * stdin and collects stdout/stderr until the process closes.
 *
 * The child is sent SIGTERM when `options.signal` aborts or when
 * `options.timeoutMs` elapses (the latter also sets `timedOut`).
 *
 * @param {{ executable: string, args: string[], cwd: string, env: object,
 *   prompt: string, timeoutMs?: number, signal?: AbortSignal }} options Run settings.
 * @returns {Promise<{ stdout: string, stderr: string, exitCode: number, timedOut: boolean }>}
 *   Captured output; `exitCode` is -1 when the process ended without a numeric code.
 * @throws Rejects with the spawn error (for example ENOENT) or with an
 *   unexpected stdin write error.
 */
async function defaultCodexRunner(options) {
  return await new Promise((resolve, reject) => {
    const child = (0, import_node_child_process2.spawn)(options.executable, options.args, {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: shouldShellExecute(options.executable)
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    const onAbort = () => {
      child.kill("SIGTERM");
    };
    if (options.signal) {
      if (options.signal.aborted) {
        onAbort();
      } else {
        options.signal.addEventListener("abort", onAbort, { once: true });
      }
    }
    let timeoutHandle;
    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill("SIGTERM");
      }, options.timeoutMs);
      // Do not let a pending timeout keep the event loop alive on its own.
      timeoutHandle.unref?.();
    }
    const cleanup = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", onAbort);
      }
    };
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    // A stream 'error' without a listener is thrown as an uncaught exception.
    // The child can go away before it reads the prompt (failed spawn, early
    // exit, kill on abort/timeout); that closed-pipe case is ignored here
    // because the 'error'/'close' handlers below already report the outcome.
    child.stdin.on("error", (error) => {
      if (error?.code === "EPIPE" || error?.code === "ERR_STREAM_DESTROYED") {
        return;
      }
      cleanup();
      reject(error);
    });
    child.stdin.end(options.prompt);
    child.on("error", (error) => {
      cleanup();
      reject(error);
    });
    child.on("close", (code) => {
      cleanup();
      resolve({
        stdout,
        stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
2426
/**
 * Decides whether the executable is spawned through a shell: only on
 * Windows, and only for `.cmd`, `.bat` or `.ps1` files (case-insensitive).
 *
 * @param {string} executable Executable name or path.
 * @returns {boolean} Value for spawn's `shell` option.
 */
function shouldShellExecute(executable) {
  if (process.platform !== "win32") {
    return false;
  }
  const lowered = executable.toLowerCase();
  return [".cmd", ".bat", ".ps1"].some((extension) => lowered.endsWith(extension));
}
1288
2433
 
1289
2434
  // src/evaluation/providers/targets-file.ts
1290
- var import_node_fs3 = require("fs");
1291
- var import_promises4 = require("fs/promises");
1292
- var import_node_path4 = __toESM(require("path"), 1);
2435
+ var import_node_fs4 = require("fs");
2436
+ var import_promises5 = require("fs/promises");
2437
+ var import_node_path7 = __toESM(require("path"), 1);
1293
2438
  var import_yaml2 = require("yaml");
1294
2439
 
1295
2440
  // src/evaluation/providers/types.ts
@@ -1350,18 +2495,18 @@ function assertTargetDefinition(value, index, filePath) {
1350
2495
  }
1351
2496
/**
 * Checks whether a path exists.
 *
 * @param {string} filePath Path to probe.
 * @returns {Promise<boolean>} `true` when the existence check succeeds,
 *   `false` on any failure.
 */
async function fileExists3(filePath) {
  // Starting from a resolved promise routes a synchronous throw from
  // access() into the same "false" branch as a rejection.
  return Promise.resolve().then(() => (0, import_promises5.access)(filePath, import_node_fs4.constants.F_OK)).then(
    () => true,
    () => false
  );
}
1359
2504
  async function readTargetDefinitions(filePath) {
1360
- const absolutePath = import_node_path4.default.resolve(filePath);
2505
+ const absolutePath = import_node_path7.default.resolve(filePath);
1361
2506
  if (!await fileExists3(absolutePath)) {
1362
2507
  throw new Error(`targets.yaml not found at ${absolutePath}`);
1363
2508
  }
1364
- const raw = await (0, import_promises4.readFile)(absolutePath, "utf8");
2509
+ const raw = await (0, import_promises5.readFile)(absolutePath, "utf8");
1365
2510
  const parsed = (0, import_yaml2.parse)(raw);
1366
2511
  if (!isRecord(parsed)) {
1367
2512
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
@@ -1384,6 +2529,10 @@ function createProvider(target) {
1384
2529
  return new AnthropicProvider(target.name, target.config);
1385
2530
  case "gemini":
1386
2531
  return new GeminiProvider(target.name, target.config);
2532
+ case "cli":
2533
+ return new CliProvider(target.name, target.config);
2534
+ case "codex":
2535
+ return new CodexProvider(target.name, target.config);
1387
2536
  case "mock":
1388
2537
  return new MockProvider(target.name, target.config);
1389
2538
  case "vscode":
@@ -1400,230 +2549,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
1400
2549
  return createProvider(resolved);
1401
2550
  }
1402
2551
 
1403
- // src/evaluation/scoring.ts
1404
- var KEY_TERM_MATCH_THRESHOLD = 0.5;
1405
- var ACTION_WORDS = /* @__PURE__ */ new Set([
1406
- "use",
1407
- "avoid",
1408
- "prefer",
1409
- "replace",
1410
- "consider",
1411
- "ensure",
1412
- "remove",
1413
- "add"
1414
- ]);
1415
- var STOP_WORDS = /* @__PURE__ */ new Set([
1416
- "the",
1417
- "a",
1418
- "an",
1419
- "and",
1420
- "or",
1421
- "but",
1422
- "in",
1423
- "on",
1424
- "at",
1425
- "to",
1426
- "for",
1427
- "of",
1428
- "with",
1429
- "by",
1430
- "is",
1431
- "are",
1432
- "was",
1433
- "were",
1434
- "be",
1435
- "been",
1436
- "being",
1437
- "have",
1438
- "has",
1439
- "had",
1440
- "do",
1441
- "does",
1442
- "did",
1443
- "will",
1444
- "would",
1445
- "could",
1446
- "should"
1447
- ]);
1448
- var ERROR_PREFIXES = [
1449
- "error:",
1450
- "err:",
1451
- "vs code command failed",
1452
- "exception",
1453
- "traceback",
1454
- "no response file was generated",
1455
- "timed out",
1456
- "cli not found"
1457
- ];
1458
- function extractAspects(expectedResponse) {
1459
- const lines = expectedResponse.split(/\r?\n/).map((line) => line.trim());
1460
- const aspects = [];
1461
- for (const line of lines) {
1462
- if (line.length === 0) {
1463
- continue;
1464
- }
1465
- const bulletMatch = /^([-*•]|[0-9]+\.)\s*(.+)$/.exec(line);
1466
- if (bulletMatch) {
1467
- const normalized = normalizeAspect(bulletMatch[2]);
1468
- if (normalized.length > 0) {
1469
- aspects.push(normalized);
1470
- }
1471
- continue;
1472
- }
1473
- const lowered = line.toLowerCase();
1474
- if (Array.from(ACTION_WORDS).some((word) => lowered.startsWith(word))) {
1475
- const normalized = normalizeAspect(line);
1476
- if (normalized.length > 0) {
1477
- aspects.push(normalized);
1478
- }
1479
- }
1480
- }
1481
- return aspects;
1482
- }
1483
- function calculateHits(candidateResponse, expectedAspects) {
1484
- const { normalizedText, words } = normalizeCandidate(candidateResponse);
1485
- const hits = [];
1486
- for (const aspect of expectedAspects) {
1487
- if (matchesAspect(aspect, normalizedText, words)) {
1488
- hits.push(aspect);
1489
- }
1490
- }
1491
- return hits;
1492
- }
1493
- function calculateMisses(candidateResponse, expectedAspects, resolvedHits) {
1494
- const hits = new Set(resolvedHits ?? calculateHits(candidateResponse, expectedAspects));
1495
- return expectedAspects.filter((aspect) => !hits.has(aspect));
1496
- }
1497
- function scoreCandidateResponse(candidateResponse, expectedAspects) {
1498
- if (expectedAspects.length === 0) {
1499
- if (isErrorLike(candidateResponse)) {
1500
- return {
1501
- score: 0,
1502
- hits: [],
1503
- misses: ["Model produced an error instead of an answer."],
1504
- hitCount: 0,
1505
- totalAspects: 0,
1506
- rawAspects: []
1507
- };
1508
- }
1509
- return {
1510
- score: 1,
1511
- hits: [],
1512
- misses: [],
1513
- hitCount: 0,
1514
- totalAspects: 0,
1515
- rawAspects: []
1516
- };
1517
- }
1518
- const hits = calculateHits(candidateResponse, expectedAspects);
1519
- const misses = expectedAspects.filter((aspect) => !hits.includes(aspect));
1520
- const score = expectedAspects.length > 0 ? hits.length / expectedAspects.length : 0;
1521
- return {
1522
- score,
1523
- hits,
1524
- misses,
1525
- hitCount: hits.length,
1526
- totalAspects: expectedAspects.length,
1527
- rawAspects: expectedAspects
1528
- };
1529
- }
1530
- function isErrorLike(text) {
1531
- if (!text) {
1532
- return false;
1533
- }
1534
- const lowered = text.trim().toLowerCase();
1535
- return ERROR_PREFIXES.some((prefix) => lowered.startsWith(prefix));
1536
- }
1537
- function normalizeAspect(aspect) {
1538
- const sanitized = aspect.toLowerCase().replace(/[^\w\s]/g, " ").replace(/\s+/g, " ").trim();
1539
- return sanitized;
1540
- }
1541
- function normalizeCandidate(candidate) {
1542
- const lowered = candidate.toLowerCase();
1543
- const normalizedText = lowered.replace(/[^\w\s]/g, " ");
1544
- const words = new Set(normalizedText.split(/\s+/).filter((word) => word.length > 0));
1545
- return { normalizedText, words };
1546
- }
1547
- function matchesAspect(aspect, candidateNormalized, candidateWords) {
1548
- const keyTerms = extractKeyTerms(aspect);
1549
- if (keyTerms.length === 0) {
1550
- return false;
1551
- }
1552
- const matches = keyTerms.filter((term) => candidateWords.has(term)).length;
1553
- const ratio = matches / keyTerms.length;
1554
- if (ratio >= KEY_TERM_MATCH_THRESHOLD) {
1555
- return true;
1556
- }
1557
- const aspectWords = aspect.split(" ");
1558
- if (aspectWords.length >= 2) {
1559
- for (let index = 0; index < aspectWords.length - 1; index += 1) {
1560
- const phrase = `${aspectWords[index]} ${aspectWords[index + 1]}`;
1561
- if (candidateNormalized.includes(phrase)) {
1562
- return true;
1563
- }
1564
- }
1565
- }
1566
- return false;
1567
- }
1568
- function extractKeyTerms(aspect, maxTerms = 5) {
1569
- const terms = [];
1570
- const words = aspect.split(" ");
1571
- for (const word of words) {
1572
- if (word.length <= 2) {
1573
- continue;
1574
- }
1575
- if (STOP_WORDS.has(word)) {
1576
- continue;
1577
- }
1578
- terms.push(word);
1579
- if (terms.length >= maxTerms) {
1580
- break;
1581
- }
1582
- }
1583
- return terms;
1584
- }
1585
-
1586
- // src/evaluation/grading.ts
2552
+ // src/evaluation/evaluators.ts
1587
2553
  var import_node_crypto = require("crypto");
1588
- var HeuristicGrader = class {
1589
- kind = "heuristic";
1590
- grade(context) {
1591
- const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
1592
- const result = scoreCandidateResponse(context.candidate, expectedAspects);
1593
- const misses = [...result.misses];
1594
- if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
1595
- const firstLine = context.candidate.split(/\r?\n/)[0]?.trim();
1596
- if (firstLine && !misses.includes(firstLine)) {
1597
- misses.unshift(firstLine);
1598
- }
1599
- }
1600
- return {
1601
- score: result.score,
1602
- hits: result.hits,
1603
- misses,
1604
- expectedAspectCount: result.totalAspects,
1605
- rawAspects: result.rawAspects
1606
- };
1607
- }
1608
- };
1609
- var QualityGrader = class {
2554
+ var LlmJudgeEvaluator = class {
1610
2555
  kind = "llm_judge";
1611
2556
  resolveJudgeProvider;
1612
2557
  maxOutputTokens;
1613
2558
  temperature;
2559
+ customPrompt;
1614
2560
  constructor(options) {
1615
2561
  this.resolveJudgeProvider = options.resolveJudgeProvider;
1616
2562
  this.maxOutputTokens = options.maxOutputTokens;
1617
2563
  this.temperature = options.temperature;
2564
+ this.customPrompt = options.customPrompt;
1618
2565
  }
1619
- async grade(context) {
2566
+ async evaluate(context) {
1620
2567
  const judgeProvider = await this.resolveJudgeProvider(context);
1621
2568
  if (!judgeProvider) {
1622
2569
  throw new Error("No judge provider available for LLM grading");
1623
2570
  }
1624
2571
  const prompt = buildQualityPrompt(context.evalCase, context.candidate);
2572
+ const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
1625
2573
  const metadata = {
1626
- systemPrompt: QUALITY_SYSTEM_PROMPT
2574
+ ...systemPrompt !== void 0 ? { systemPrompt } : {},
2575
+ ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
1627
2576
  };
1628
2577
  const response = await judgeProvider.invoke({
1629
2578
  prompt,
@@ -1638,12 +2587,13 @@ var QualityGrader = class {
1638
2587
  const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
1639
2588
  const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
1640
2589
  const reasoning = parsed.reasoning ?? response.reasoning;
1641
- const graderRawRequest = {
2590
+ const evaluatorRawRequest = {
1642
2591
  id: (0, import_node_crypto.randomUUID)(),
1643
2592
  provider: judgeProvider.id,
1644
2593
  prompt,
1645
- systemPrompt: QUALITY_SYSTEM_PROMPT,
1646
- target: context.target.name
2594
+ target: context.target.name,
2595
+ ...systemPrompt !== void 0 ? { systemPrompt } : {},
2596
+ ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
1647
2597
  };
1648
2598
  return {
1649
2599
  score,
@@ -1651,7 +2601,7 @@ var QualityGrader = class {
1651
2601
  misses,
1652
2602
  expectedAspectCount: hits.length + misses.length || 1,
1653
2603
  reasoning,
1654
- graderRawRequest
2604
+ evaluatorRawRequest
1655
2605
  };
1656
2606
  }
1657
2607
  };
@@ -1769,11 +2719,117 @@ function extractJsonBlob(text) {
1769
2719
  function isNonEmptyString(value) {
1770
2720
  return typeof value === "string" && value.trim().length > 0;
1771
2721
  }
2722
/**
 * Evaluator that delegates scoring to an external script.
 *
 * The script receives a JSON document describing the eval case and the
 * candidate answer on stdin, and is expected to print a JSON object on stdout
 * with optional `score` (number), `hits` (string[]), `misses` (string[]) and
 * `reasoning` (string) fields.
 */
var CodeEvaluator = class {
  kind = "code";
  script;
  cwd;
  agentTimeoutMs;
  /**
   * @param {{ script: string, cwd?: string, agentTimeoutMs?: number }} options
   *   `script` is the command handed to `executeScript`, `cwd` its optional
   *   working directory and `agentTimeoutMs` the optional time limit.
   */
  constructor(options) {
    this.script = options.script;
    this.cwd = options.cwd;
    this.agentTimeoutMs = options.agentTimeoutMs;
  }
  /**
   * Runs the script for one candidate and converts its output into a score.
   *
   * Never rejects because of the script itself: a script failure, a timeout,
   * or output that is not a JSON object all produce a zero score whose single
   * miss (and `reasoning`) explains what went wrong.
   *
   * @param {object} context - Evaluation context; reads `evalCase`,
   *   `candidate` and `promptInputs.systemMessage`.
   * @returns {Promise<object>} Score record with `score`, `hits`, `misses`,
   *   `expectedAspectCount`, `reasoning` and `evaluatorRawRequest`.
   */
  async evaluate(context) {
    const { evalCase } = context;
    const inputPayload = JSON.stringify(
      {
        task: evalCase.task,
        outcome: evalCase.outcome,
        expected: evalCase.expected_assistant_raw,
        output: context.candidate,
        system_message: context.promptInputs.systemMessage ?? "",
        guideline_paths: evalCase.guideline_paths,
        attachments: evalCase.file_paths,
        user_segments: evalCase.user_segments
      },
      null,
      2
    );
    // Shared by the success and failure results so both describe the same run.
    const evaluatorRawRequest = {
      script: this.script,
      ...this.cwd ? { cwd: this.cwd } : {}
    };
    try {
      const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
      const parsed = parseJsonSafe(stdout);
      // Without this guard, unparseable output would yield a bare zero score
      // that cannot be told apart from a genuine "everything missed" verdict.
      if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
        throw new Error("Script did not print a JSON object on stdout");
      }
      const score = clampScore(typeof parsed.score === "number" ? parsed.score : 0);
      const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString) : [];
      const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString) : [];
      const reasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : void 0;
      return {
        score,
        hits,
        misses,
        // Report at least one aspect so downstream ratios never divide by zero.
        expectedAspectCount: hits.length + misses.length || 1,
        reasoning,
        evaluatorRawRequest
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        hits: [],
        misses: [`Code evaluator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: {
          ...evaluatorRawRequest,
          error: message
        }
      };
    }
  }
};
2782
/**
 * Runs an evaluator script through the system shell, writes `input` to its
 * stdin and resolves with its trimmed stdout.
 *
 * @param {string} scriptPath - Command line handed to the shell (`shell: true`).
 * @param {string} input - Text written to the script's stdin.
 * @param {number} [agentTimeoutMs] - Optional time limit; when it elapses the
 *   child is killed and the promise rejects. A falsy value disables the limit.
 * @param {string} [cwd] - Optional working directory for the script.
 * @returns {Promise<string>} Trimmed stdout of the script.
 * @throws {Error} When the process cannot be spawned, times out, or exits with
 *   a non-zero code after writing to stderr.
 */
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
  const { spawn: spawn2 } = await import("child_process");
  return await new Promise((resolve, reject) => {
    const child = spawn2(scriptPath, {
      shell: true,
      cwd
    });
    let stdout = "";
    let stderr = "";
    let settled = false;
    let timeout;
    // Several events can race (timeout, error, close); only the first one
    // decides the outcome, and it always releases the timer.
    const settle = (finish, value) => {
      if (settled) {
        return;
      }
      settled = true;
      if (timeout !== void 0) {
        clearTimeout(timeout);
      }
      finish(value);
    };
    if (agentTimeoutMs) {
      timeout = setTimeout(() => {
        child.kill();
        settle(reject, new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
      }, agentTimeoutMs);
    }
    // Decode at the stream level so a multi-byte character split across two
    // chunks is not corrupted by per-chunk conversion.
    child.stdout?.setEncoding("utf8");
    child.stderr?.setEncoding("utf8");
    child.stdout?.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr?.on("data", (chunk) => {
      stderr += chunk;
    });
    // A script may exit without reading its input; the resulting broken-pipe
    // error on stdin must not surface as an unhandled 'error' event. The exit
    // code and captured output below decide the outcome instead.
    child.stdin?.on("error", () => {
    });
    child.on("error", (error) => {
      settle(reject, error);
    });
    // 'close' fires after the stdio streams have ended, so stdout is complete;
    // 'exit' may fire while output is still buffered.
    child.on("close", (code) => {
      // Deliberately lenient: a non-zero exit only counts as a failure when the
      // script also reported something on stderr.
      if (code && code !== 0 && stderr.length > 0) {
        settle(reject, new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
        return;
      }
      settle(resolve, stdout.trim());
    });
    child.stdin?.end(input);
  });
}
2821
/**
 * Parses a JSON document without throwing.
 *
 * @param {string} payload - Text expected to contain JSON.
 * @returns {*} The parsed value, or `undefined` when `payload` is not valid JSON.
 */
function parseJsonSafe(payload) {
  let parsed = void 0;
  try {
    parsed = JSON.parse(payload);
  } catch {
    // Invalid JSON is an expected outcome here; signal it with `undefined`.
    parsed = void 0;
  }
  return parsed;
}
1772
2828
 
1773
2829
  // src/evaluation/orchestrator.ts
1774
2830
  var import_node_crypto2 = require("crypto");
1775
- var import_promises5 = require("fs/promises");
1776
- var import_node_path5 = __toESM(require("path"), 1);
2831
+ var import_promises6 = require("fs/promises");
2832
+ var import_node_path8 = __toESM(require("path"), 1);
1777
2833
 
1778
2834
  // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
1779
2835
  var Node = class {
@@ -1920,7 +2976,7 @@ async function runEvaluation(options) {
1920
2976
  targets,
1921
2977
  env,
1922
2978
  providerFactory,
1923
- graders,
2979
+ evaluators,
1924
2980
  maxRetries,
1925
2981
  agentTimeoutMs,
1926
2982
  promptDumpDir,
@@ -1979,8 +3035,14 @@ async function runEvaluation(options) {
1979
3035
  }
1980
3036
  return getOrCreateProvider(resolvedJudge);
1981
3037
  };
1982
- const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
3038
+ const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
1983
3039
  const primaryProvider = getOrCreateProvider(target);
3040
+ const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
3041
+ if (target.providerBatching && !providerSupportsBatch && verbose) {
3042
+ console.warn(
3043
+ `Provider batching requested for target '${target.name}', but provider does not advertise batch support. Using per-case dispatch.`
3044
+ );
3045
+ }
1984
3046
  if (onProgress && filteredEvalCases.length > 0) {
1985
3047
  for (let i = 0; i < filteredEvalCases.length; i++) {
1986
3048
  await onProgress({
@@ -1990,6 +3052,28 @@ async function runEvaluation(options) {
1990
3052
  });
1991
3053
  }
1992
3054
  }
3055
+ if (providerSupportsBatch) {
3056
+ try {
3057
+ return await runBatchEvaluation({
3058
+ evalCases: filteredEvalCases,
3059
+ provider: primaryProvider,
3060
+ target,
3061
+ evaluatorRegistry,
3062
+ promptDumpDir,
3063
+ nowFn: now ?? (() => /* @__PURE__ */ new Date()),
3064
+ onProgress,
3065
+ onResult,
3066
+ verbose,
3067
+ resolveJudgeProvider,
3068
+ agentTimeoutMs
3069
+ });
3070
+ } catch (error) {
3071
+ if (verbose) {
3072
+ const message = error instanceof Error ? error.message : String(error);
3073
+ console.warn(`Provider batch execution failed, falling back to per-case dispatch: ${message}`);
3074
+ }
3075
+ }
3076
+ }
1993
3077
  const workers = options.maxConcurrency ?? target.workers ?? 1;
1994
3078
  const limit = pLimit(workers);
1995
3079
  let nextWorkerId = 1;
@@ -2012,7 +3096,7 @@ async function runEvaluation(options) {
2012
3096
  evalCase,
2013
3097
  provider: primaryProvider,
2014
3098
  target,
2015
- graders: graderRegistry,
3099
+ evaluators: evaluatorRegistry,
2016
3100
  maxRetries,
2017
3101
  agentTimeoutMs,
2018
3102
  promptDumpDir,
@@ -2073,12 +3157,118 @@ async function runEvaluation(options) {
2073
3157
  }
2074
3158
  return results;
2075
3159
  }
3160
/**
 * Evaluates all eval cases with a single batched provider call, then grades
 * each returned answer one by one.
 *
 * Progress events use `workerId: 1` throughout. The time at which the batch
 * was dispatched is captured once and reported as `startedAt` on both the
 * "running" and the "completed" events, so consumers can derive a duration.
 *
 * @param {object} options - Reads `evalCases`, `provider`, `target`,
 *   `evaluatorRegistry`, `promptDumpDir`, `nowFn`, `onProgress`, `onResult`,
 *   `resolveJudgeProvider` and `agentTimeoutMs`.
 * @returns {Promise<object[]>} One result per eval case, in input order. A case
 *   whose grading throws is represented by the value of `buildErrorResult`.
 * @throws {Error} When `invokeBatch` does not return an array, or returns a
 *   different number of responses than there are eval cases.
 */
async function runBatchEvaluation(options) {
  const {
    evalCases,
    provider,
    target,
    evaluatorRegistry,
    promptDumpDir,
    nowFn,
    onProgress,
    onResult,
    resolveJudgeProvider,
    agentTimeoutMs
  } = options;
  const promptInputsList = [];
  for (const evalCase of evalCases) {
    const promptInputs = await buildPromptInputs(evalCase);
    if (promptDumpDir) {
      await dumpPrompt(promptDumpDir, evalCase, promptInputs);
    }
    promptInputsList.push(promptInputs);
  }
  const batchRequests = evalCases.map((evalCase, index) => {
    const promptInputs = promptInputsList[index];
    return {
      prompt: promptInputs.request,
      guidelines: promptInputs.guidelines,
      guideline_patterns: evalCase.guideline_patterns,
      inputFiles: evalCase.file_paths,
      evalCaseId: evalCase.id,
      metadata: {
        systemPrompt: promptInputs.systemMessage ?? ""
      }
    };
  });
  // Captured before dispatch so it reflects when work on the batch began.
  const startedAt = Date.now();
  const batchResponse = await provider.invokeBatch?.(batchRequests);
  if (!Array.isArray(batchResponse)) {
    throw new Error("Provider batching failed: invokeBatch did not return an array");
  }
  if (batchResponse.length !== evalCases.length) {
    throw new Error(
      `Provider batching failed: expected ${evalCases.length} responses, received ${batchResponse.length}`
    );
  }
  if (onProgress) {
    for (const evalCase of evalCases) {
      await onProgress({
        workerId: 1,
        evalId: evalCase.id,
        status: "running",
        startedAt
      });
    }
  }
  const results = [];
  for (let i = 0; i < evalCases.length; i++) {
    const evalCase = evalCases[i];
    const promptInputs = promptInputsList[i];
    const providerResponse = batchResponse[i];
    let result;
    try {
      result = await evaluateCandidate({
        evalCase,
        candidate: providerResponse.text ?? "",
        target,
        provider,
        evaluators: evaluatorRegistry,
        promptInputs,
        nowFn,
        attempt: 0,
        judgeProvider: await resolveJudgeProvider(target),
        agentTimeoutMs
      });
    } catch (error) {
      // One failing case must not abort the remaining cases of the batch.
      const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
      results.push(errorResult);
      if (onResult) {
        await onResult(errorResult);
      }
      if (onProgress) {
        await onProgress({
          workerId: 1,
          evalId: evalCase.id,
          status: "failed",
          completedAt: Date.now(),
          error: error instanceof Error ? error.message : String(error)
        });
      }
      continue;
    }
    results.push(result);
    if (onResult) {
      await onResult(result);
    }
    if (onProgress) {
      await onProgress({
        workerId: 1,
        evalId: evalCase.id,
        status: "completed",
        startedAt,
        completedAt: Date.now()
      });
    }
  }
  return results;
}
2076
3266
  async function runEvalCase(options) {
2077
3267
  const {
2078
3268
  evalCase,
2079
3269
  provider,
2080
3270
  target,
2081
- graders,
3271
+ evaluators,
2082
3272
  now,
2083
3273
  maxRetries,
2084
3274
  agentTimeoutMs,
@@ -2133,27 +3323,49 @@ async function runEvalCase(options) {
2133
3323
  if (cacheKey && cache && !cachedResponse) {
2134
3324
  await cache.set(cacheKey, providerResponse);
2135
3325
  }
2136
- const graderKind = evalCase.grader ?? "heuristic";
2137
- const activeGrader = graders[graderKind] ?? graders.heuristic;
2138
- if (!activeGrader) {
2139
- throw new Error(`No grader registered for kind '${graderKind}'`);
2140
- }
2141
- let grade;
2142
3326
  try {
2143
- const gradeTimestamp = nowFn();
2144
- grade = await activeGrader.grade({
3327
+ return await evaluateCandidate({
2145
3328
  evalCase,
2146
3329
  candidate: providerResponse.text ?? "",
2147
3330
  target,
2148
3331
  provider,
2149
- attempt,
3332
+ evaluators,
2150
3333
  promptInputs,
2151
- now: gradeTimestamp,
2152
- judgeProvider
3334
+ nowFn,
3335
+ attempt,
3336
+ judgeProvider,
3337
+ agentTimeoutMs
2153
3338
  });
2154
3339
  } catch (error) {
2155
3340
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
2156
3341
  }
3342
+ }
3343
+ async function evaluateCandidate(options) {
3344
+ const {
3345
+ evalCase,
3346
+ candidate,
3347
+ target,
3348
+ provider,
3349
+ evaluators,
3350
+ promptInputs,
3351
+ nowFn,
3352
+ attempt,
3353
+ judgeProvider,
3354
+ agentTimeoutMs
3355
+ } = options;
3356
+ const gradeTimestamp = nowFn();
3357
+ const { score, evaluatorResults } = await runEvaluatorsForCase({
3358
+ evalCase,
3359
+ candidate,
3360
+ target,
3361
+ provider,
3362
+ evaluators,
3363
+ attempt,
3364
+ promptInputs,
3365
+ now: gradeTimestamp,
3366
+ judgeProvider,
3367
+ agentTimeoutMs
3368
+ });
2157
3369
  const completedAt = nowFn();
2158
3370
  const rawRequest = {
2159
3371
  request: promptInputs.request,
@@ -2164,28 +3376,200 @@ async function runEvalCase(options) {
2164
3376
  return {
2165
3377
  eval_id: evalCase.id,
2166
3378
  conversation_id: evalCase.conversation_id,
2167
- score: grade.score,
2168
- hits: grade.hits,
2169
- misses: grade.misses,
2170
- model_answer: providerResponse.text ?? "",
2171
- expected_aspect_count: grade.expectedAspectCount,
3379
+ score: score.score,
3380
+ hits: score.hits,
3381
+ misses: score.misses,
3382
+ model_answer: candidate,
3383
+ expected_aspect_count: score.expectedAspectCount,
2172
3384
  target: target.name,
2173
3385
  timestamp: completedAt.toISOString(),
2174
- reasoning: grade.reasoning,
2175
- raw_aspects: grade.rawAspects,
3386
+ reasoning: score.reasoning,
3387
+ raw_aspects: score.rawAspects,
2176
3388
  raw_request: rawRequest,
2177
- grader_raw_request: grade.graderRawRequest
3389
+ evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
3390
+ evaluator_results: evaluatorResults
2178
3391
  };
2179
3392
  }
3393
/**
 * Grades one candidate answer for an eval case.
 *
 * When the eval case declares a non-empty `evaluators` list, the whole list is
 * handed to `runEvaluatorList`. Otherwise a single evaluator is looked up in
 * the registry by `evalCase.evaluator` (defaulting to, and falling back on,
 * `llm_judge`) and its verdict is returned on its own.
 *
 * @param {object} options - Reads `evalCase`, `candidate`, `target`,
 *   `provider`, `evaluators` (the registry), `attempt`, `promptInputs`, `now`,
 *   `judgeProvider` and `agentTimeoutMs`.
 * @returns {Promise<object>} `{ score, evaluatorResults }` for a list, or
 *   `{ score }` for a single evaluator.
 * @throws {Error} When no evaluator is registered for the requested kind and
 *   the registry has no `llm_judge` entry either.
 */
async function runEvaluatorsForCase(options) {
  const registry = options.evaluators;
  const { evalCase, agentTimeoutMs } = options;
  // Fields that every evaluator receives, whichever path is taken.
  const sharedContext = {
    evalCase,
    candidate: options.candidate,
    target: options.target,
    provider: options.provider,
    attempt: options.attempt,
    promptInputs: options.promptInputs,
    now: options.now,
    judgeProvider: options.judgeProvider
  };
  if (evalCase.evaluators?.length > 0) {
    return runEvaluatorList({
      ...sharedContext,
      evaluators: evalCase.evaluators,
      evaluatorRegistry: registry,
      agentTimeoutMs
    });
  }
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
  const activeEvaluator = registry[evaluatorKind] ?? registry.llm_judge;
  if (activeEvaluator) {
    const score = await activeEvaluator.evaluate(sharedContext);
    return { score };
  }
  throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
}
3427
/**
 * Runs every evaluator configured on an eval case and merges their verdicts.
 *
 * Each entry is dispatched by its `type`: `"llm_judge"` goes through
 * `runLlmJudgeEvaluator`, `"code"` through a fresh `CodeEvaluator`. An entry
 * that throws, or whose type is not supported, is recorded as a zero-score
 * result with an explanatory miss instead of being dropped, so a problem in
 * one evaluator is always visible and never aborts the others.
 *
 * The aggregate score is the arithmetic mean of all recorded scores; hits,
 * misses and raw aspects are concatenated, and the individual reasonings are
 * joined as `"<name>: <reasoning>"` separated by `" | "`.
 *
 * @param {object} options - Reads `evalCase`, `evaluators` (the configured
 *   list), `candidate`, `target`, `provider`, `evaluatorRegistry`, `attempt`,
 *   `promptInputs`, `now`, `judgeProvider` and `agentTimeoutMs`.
 * @returns {Promise<{score: object, evaluatorResults: object[]}>} The merged
 *   score and one result record per configured evaluator.
 */
async function runEvaluatorList(options) {
  const {
    evalCase,
    evaluators,
    candidate,
    target,
    provider,
    evaluatorRegistry,
    attempt,
    promptInputs,
    now,
    judgeProvider,
    agentTimeoutMs
  } = options;
  const scored = [];
  const evaluatorResults = [];
  for (const evaluator of evaluators ?? []) {
    try {
      let score2;
      if (evaluator.type === "llm_judge") {
        score2 = await runLlmJudgeEvaluator({
          config: evaluator,
          evalCase,
          candidate,
          target,
          provider,
          evaluatorRegistry,
          attempt,
          promptInputs,
          now,
          judgeProvider
        });
      } else if (evaluator.type === "code") {
        const codeEvaluator = new CodeEvaluator({
          script: evaluator.script,
          cwd: evaluator.resolvedCwd ?? evaluator.cwd,
          agentTimeoutMs
        });
        score2 = await codeEvaluator.evaluate({
          evalCase,
          candidate,
          target,
          provider,
          attempt,
          promptInputs,
          now
        });
      } else {
        // Route unsupported types through the failure path below rather than
        // silently ignoring a configured evaluator.
        throw new Error(`Unsupported evaluator type '${evaluator.type}'`);
      }
      scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
      evaluatorResults.push({
        name: evaluator.name,
        type: evaluator.type,
        score: score2.score,
        hits: score2.hits,
        misses: score2.misses,
        reasoning: score2.reasoning,
        evaluator_raw_request: score2.evaluatorRawRequest
      });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      // Use one label for both records so the aggregate and the per-evaluator
      // entry report the failure identically.
      const name = evaluator?.name ?? "unknown";
      const type = evaluator?.type ?? "unknown";
      const failureMiss = `Evaluator '${name}' failed: ${message}`;
      const fallbackScore = {
        score: 0,
        hits: [],
        misses: [failureMiss],
        expectedAspectCount: 1,
        reasoning: message
      };
      scored.push({ score: fallbackScore, name, type });
      evaluatorResults.push({
        name,
        type,
        score: 0,
        hits: [],
        misses: [failureMiss],
        reasoning: message
      });
    }
  }
  const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
  const hits = scored.flatMap((entry) => entry.score.hits);
  const misses = scored.flatMap((entry) => entry.score.misses);
  const expectedAspectCount = scored.reduce((total, entry) => total + (entry.score.expectedAspectCount ?? 0), 0);
  const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
  const score = {
    score: aggregateScore,
    hits,
    misses,
    expectedAspectCount,
    reasoning,
    rawAspects: rawAspects.length > 0 ? rawAspects : void 0
  };
  return { score, evaluatorResults };
}
3534
/**
 * Runs the registry's `llm_judge` evaluator for one configured evaluator entry.
 *
 * The entry's custom prompt (resolved via `resolveCustomPrompt`) is forwarded
 * as `systemPrompt`, the entry itself as `evaluator`, and its `model` as
 * `judgeModel`, alongside the usual evaluation context.
 *
 * @param {object} options - Reads `config`, `evalCase`, `candidate`, `target`,
 *   `provider`, `evaluatorRegistry`, `attempt`, `promptInputs`, `now` and
 *   `judgeProvider`.
 * @returns {Promise<object>} Whatever the `llm_judge` evaluator returns.
 */
async function runLlmJudgeEvaluator(options) {
  const { config, evaluatorRegistry } = options;
  // Resolve the prompt first; the registry is only touched afterwards.
  const systemPrompt = await resolveCustomPrompt(config);
  const judgeContext = {
    evalCase: options.evalCase,
    candidate: options.candidate,
    target: options.target,
    provider: options.provider,
    attempt: options.attempt,
    promptInputs: options.promptInputs,
    now: options.now,
    judgeProvider: options.judgeProvider,
    systemPrompt,
    evaluator: config,
    judgeModel: config.model
  };
  return evaluatorRegistry.llm_judge.evaluate(judgeContext);
}
3551
/**
 * Determines the custom judge prompt for an evaluator entry.
 *
 * A `promptPath` takes precedence: the file is read as UTF-8 text. If the file
 * cannot be read, a warning is logged and the inline `prompt` is used instead.
 *
 * @param {{ promptPath?: string, prompt?: string }} config - Evaluator entry.
 * @returns {Promise<string | undefined>} File contents, the inline prompt, or
 *   `undefined` when neither is available.
 */
async function resolveCustomPrompt(config) {
  const { promptPath } = config;
  if (!promptPath) {
    return config.prompt;
  }
  try {
    return await (0, import_promises6.readFile)(promptPath, "utf8");
  } catch (error) {
    // Best effort: an unreadable prompt file degrades to the inline prompt.
    const message = error instanceof Error ? error.message : String(error);
    console.warn(`Could not read custom prompt at ${promptPath}: ${message}`);
  }
  return config.prompt;
}
3562
/**
 * Tells whether a value is a string with visible content.
 *
 * @param {unknown} value - Candidate value of any type.
 * @returns {boolean} `true` when `value` is a string that is not empty after
 *   trimming whitespace; `false` for every other value.
 */
function isNonEmptyString2(value) {
  const text = typeof value === "string" ? value.trim() : "";
  return text.length > 0;
}
2180
3565
  function filterEvalCases(evalCases, evalId) {
2181
3566
  if (!evalId) {
2182
3567
  return evalCases;
2183
3568
  }
2184
3569
  return evalCases.filter((evalCase) => evalCase.id === evalId);
2185
3570
  }
2186
- function buildGraderRegistry(overrides, resolveJudgeProvider) {
2187
- const heuristic = overrides?.heuristic ?? new HeuristicGrader();
2188
- const llmJudge = overrides?.llm_judge ?? new QualityGrader({
3571
+ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
3572
+ const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
2189
3573
  resolveJudgeProvider: async (context) => {
2190
3574
  if (context.judgeProvider) {
2191
3575
  return context.judgeProvider;
@@ -2195,22 +3579,21 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
2195
3579
  });
2196
3580
  return {
2197
3581
  ...overrides,
2198
- heuristic,
2199
3582
  llm_judge: llmJudge
2200
3583
  };
2201
3584
  }
2202
3585
  async function dumpPrompt(directory, evalCase, promptInputs) {
2203
3586
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
2204
3587
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
2205
- const filePath = import_node_path5.default.resolve(directory, filename);
2206
- await (0, import_promises5.mkdir)(import_node_path5.default.dirname(filePath), { recursive: true });
3588
+ const filePath = import_node_path8.default.resolve(directory, filename);
3589
+ await (0, import_promises6.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
2207
3590
  const payload = {
2208
3591
  eval_id: evalCase.id,
2209
3592
  request: promptInputs.request,
2210
3593
  guidelines: promptInputs.guidelines,
2211
3594
  guideline_paths: evalCase.guideline_paths
2212
3595
  };
2213
- await (0, import_promises5.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
3596
+ await (0, import_promises6.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
2214
3597
  }
2215
3598
  function sanitizeFilename(value) {
2216
3599
  if (!value) {
@@ -2220,7 +3603,7 @@ function sanitizeFilename(value) {
2220
3603
  return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
2221
3604
  }
2222
3605
  async function invokeProvider(provider, options) {
2223
- const { evalCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
3606
+ const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
2224
3607
  const controller = new AbortController();
2225
3608
  const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
2226
3609
  if (signal) {
@@ -2231,7 +3614,7 @@ async function invokeProvider(provider, options) {
2231
3614
  prompt: promptInputs.request,
2232
3615
  guidelines: promptInputs.guidelines,
2233
3616
  guideline_patterns: evalCase.guideline_patterns,
2234
- attachments: evalCase.file_paths,
3617
+ inputFiles: evalCase.file_paths,
2235
3618
  evalCaseId: evalCase.id,
2236
3619
  attempt,
2237
3620
  metadata: {
@@ -2300,25 +3683,20 @@ function createAgentKernel() {
2300
3683
  }
2301
3684
  // Annotate the CommonJS export names for ESM import in node:
2302
3685
  0 && (module.exports = {
2303
- GRADER_KINDS,
2304
- HeuristicGrader,
2305
- QualityGrader,
3686
+ CodeEvaluator,
3687
+ LlmJudgeEvaluator,
2306
3688
  TEST_MESSAGE_ROLES,
2307
3689
  buildDirectoryChain,
2308
3690
  buildPromptInputs,
2309
3691
  buildSearchRoots,
2310
- calculateHits,
2311
- calculateMisses,
2312
3692
  createAgentKernel,
2313
3693
  createProvider,
2314
3694
  ensureVSCodeSubagents,
2315
- extractAspects,
2316
3695
  extractCodeBlocks,
2317
3696
  fileExists,
2318
3697
  findGitRoot,
2319
3698
  getHitCount,
2320
- isErrorLike,
2321
- isGraderKind,
3699
+ isEvaluatorKind,
2322
3700
  isGuidelineFile,
2323
3701
  isJsonObject,
2324
3702
  isJsonValue,
@@ -2331,7 +3709,6 @@ function createAgentKernel() {
2331
3709
  resolveFileReference,
2332
3710
  resolveTargetDefinition,
2333
3711
  runEvalCase,
2334
- runEvaluation,
2335
- scoreCandidateResponse
3712
+ runEvaluation
2336
3713
  });
2337
3714
  //# sourceMappingURL=index.cjs.map