@agentv/core 0.2.8 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5,7 +5,7 @@ import {
5
5
  fileExists,
6
6
  findGitRoot,
7
7
  resolveFileReference
8
- } from "./chunk-XXNQA4EW.js";
8
+ } from "./chunk-NL7K4CAK.js";
9
9
 
10
10
  // src/evaluation/types.ts
11
11
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -48,11 +48,10 @@ function isTestMessage(value) {
48
48
  }
49
49
  return candidate.content.every(isJsonObject);
50
50
  }
51
- var GRADER_KIND_VALUES = ["heuristic", "llm_judge"];
52
- var GRADER_KINDS = GRADER_KIND_VALUES;
53
- var GRADER_KIND_SET = new Set(GRADER_KIND_VALUES);
54
- function isGraderKind(value) {
55
- return typeof value === "string" && GRADER_KIND_SET.has(value);
51
+ var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
52
+ var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
53
+ function isEvaluatorKind(value) {
54
+ return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
56
55
  }
57
56
  function getHitCount(result) {
58
57
  return result.hits.length;
@@ -160,7 +159,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
160
159
  if (!Array.isArray(rawTestcases)) {
161
160
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
162
161
  }
163
- const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
162
+ const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
164
163
  const results = [];
165
164
  for (const rawEvalcase of rawTestcases) {
166
165
  if (!isJsonObject(rawEvalcase)) {
@@ -283,7 +282,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
283
282
  const assistantContent = assistantMessages[0]?.content;
284
283
  const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
285
284
  const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
286
- const testCaseGrader = coerceGrader(evalcase.grader) ?? globalGrader;
285
+ const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
286
+ const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
287
287
  const userFilePaths = [];
288
288
  for (const segment of userSegments) {
289
289
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -306,7 +306,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
306
306
  file_paths: allFilePaths,
307
307
  code_snippets: codeSnippets,
308
308
  outcome,
309
- grader: testCaseGrader
309
+ evaluator: testCaseEvaluatorKind,
310
+ evaluators
310
311
  };
311
312
  if (verbose) {
312
313
  console.log(`
@@ -467,14 +468,88 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
467
468
  }
468
469
  return parts.join(" ");
469
470
  }
470
- function coerceGrader(candidate) {
471
+ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
472
+ const execution = rawEvalCase.execution;
473
+ const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators;
474
+ if (candidateEvaluators === void 0) {
475
+ return void 0;
476
+ }
477
+ if (!Array.isArray(candidateEvaluators)) {
478
+ logWarning(`Skipping evaluators for '${evalId}': expected array`);
479
+ return void 0;
480
+ }
481
+ const evaluators = [];
482
+ for (const rawEvaluator of candidateEvaluators) {
483
+ if (!isJsonObject(rawEvaluator)) {
484
+ logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
485
+ continue;
486
+ }
487
+ const name = asString(rawEvaluator.name);
488
+ const typeValue = rawEvaluator.type;
489
+ if (!name || !isEvaluatorKind(typeValue)) {
490
+ logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
491
+ continue;
492
+ }
493
+ if (typeValue === "code") {
494
+ const script = asString(rawEvaluator.script);
495
+ if (!script) {
496
+ logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
497
+ continue;
498
+ }
499
+ const cwd = asString(rawEvaluator.cwd);
500
+ let resolvedCwd;
501
+ if (cwd) {
502
+ const resolved = await resolveFileReference(cwd, searchRoots);
503
+ if (resolved.resolvedPath) {
504
+ resolvedCwd = path.resolve(resolved.resolvedPath);
505
+ } else {
506
+ logWarning(
507
+ `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
508
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
509
+ );
510
+ }
511
+ }
512
+ evaluators.push({
513
+ name,
514
+ type: "code",
515
+ script,
516
+ cwd,
517
+ resolvedCwd
518
+ });
519
+ continue;
520
+ }
521
+ const prompt = asString(rawEvaluator.prompt);
522
+ let promptPath;
523
+ if (prompt) {
524
+ const resolved = await resolveFileReference(prompt, searchRoots);
525
+ if (resolved.resolvedPath) {
526
+ promptPath = path.resolve(resolved.resolvedPath);
527
+ } else {
528
+ logWarning(
529
+ `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
530
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
531
+ );
532
+ }
533
+ }
534
+ const model = asString(rawEvaluator.model);
535
+ evaluators.push({
536
+ name,
537
+ type: "llm_judge",
538
+ prompt,
539
+ promptPath,
540
+ model
541
+ });
542
+ }
543
+ return evaluators.length > 0 ? evaluators : void 0;
544
+ }
545
+ function coerceEvaluator(candidate, contextId) {
471
546
  if (typeof candidate !== "string") {
472
547
  return void 0;
473
548
  }
474
- if (isGraderKind(candidate)) {
549
+ if (isEvaluatorKind(candidate)) {
475
550
  return candidate;
476
551
  }
477
- logWarning(`Unknown grader '${candidate}', falling back to default`);
552
+ logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
478
553
  return void 0;
479
554
  }
480
555
  function logWarning(message, details) {
@@ -670,6 +745,214 @@ var GeminiProvider = class {
670
745
  }
671
746
  };
672
747
 
748
+ // src/evaluation/providers/cli.ts
749
+ import { exec as execWithCallback } from "node:child_process";
750
+ import path2 from "node:path";
751
+ import { promisify } from "node:util";
752
+ var execAsync = promisify(execWithCallback);
753
+ var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
754
+ async function defaultCommandRunner(command, options) {
755
+ const execOptions = {
756
+ cwd: options.cwd,
757
+ env: options.env,
758
+ timeout: options.timeoutMs,
759
+ signal: options.signal,
760
+ maxBuffer: DEFAULT_MAX_BUFFER,
761
+ shell: process.platform === "win32" ? "powershell.exe" : void 0
762
+ };
763
+ try {
764
+ const { stdout, stderr } = await execAsync(command, execOptions);
765
+ return {
766
+ stdout,
767
+ stderr,
768
+ exitCode: 0,
769
+ failed: false,
770
+ timedOut: false,
771
+ signal: null
772
+ };
773
+ } catch (error) {
774
+ const execError = error;
775
+ return {
776
+ stdout: execError.stdout ?? "",
777
+ stderr: execError.stderr ?? "",
778
+ exitCode: typeof execError.code === "number" ? execError.code : null,
779
+ failed: true,
780
+ timedOut: execError.timedOut === true || execError.killed === true,
781
+ signal: execError.signal ?? null
782
+ };
783
+ }
784
+ }
785
+ var CliProvider = class {
786
+ id;
787
+ kind = "cli";
788
+ targetName;
789
+ supportsBatch = false;
790
+ config;
791
+ runCommand;
792
+ healthcheckPromise;
793
+ constructor(targetName, config, runner = defaultCommandRunner) {
794
+ this.targetName = targetName;
795
+ this.id = `cli:${targetName}`;
796
+ this.config = config;
797
+ this.runCommand = runner;
798
+ }
799
+ async invoke(request) {
800
+ if (request.signal?.aborted) {
801
+ throw new Error("CLI provider request was aborted before execution");
802
+ }
803
+ await this.ensureHealthy(request.signal);
804
+ const templateValues = buildTemplateValues(request, this.config);
805
+ const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
806
+ const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
807
+ const result = await this.runCommand(renderedCommand, {
808
+ cwd: this.config.cwd,
809
+ env,
810
+ timeoutMs: this.config.timeoutMs,
811
+ signal: request.signal
812
+ });
813
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
814
+ if (request.signal?.aborted) {
815
+ throw new Error("CLI provider request was aborted");
816
+ }
817
+ if (result.timedOut) {
818
+ throw new Error(
819
+ `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
820
+ );
821
+ }
822
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
823
+ const detail = result.stderr.trim() || result.stdout.trim();
824
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
825
+ throw new Error(message);
826
+ }
827
+ return {
828
+ text: result.stdout,
829
+ raw: {
830
+ command: renderedCommand,
831
+ stderr: result.stderr,
832
+ exitCode: result.exitCode ?? 0,
833
+ cwd: this.config.cwd
834
+ }
835
+ };
836
+ }
837
+ async ensureHealthy(signal) {
838
+ if (!this.config.healthcheck) {
839
+ return;
840
+ }
841
+ if (!this.healthcheckPromise) {
842
+ this.healthcheckPromise = this.runHealthcheck(this.config.healthcheck, signal);
843
+ }
844
+ return this.healthcheckPromise;
845
+ }
846
+ async runHealthcheck(healthcheck, signal) {
847
+ if (!healthcheck) {
848
+ return;
849
+ }
850
+ const timeoutMs = healthcheck.timeoutMs ?? this.config.timeoutMs;
851
+ if (healthcheck.type === "http") {
852
+ const controller = new AbortController();
853
+ const timer = timeoutMs ? setTimeout(() => controller.abort(), timeoutMs) : void 0;
854
+ signal?.addEventListener("abort", () => controller.abort(), { once: true });
855
+ try {
856
+ const response = await fetch(healthcheck.url, { method: "GET", signal: controller.signal });
857
+ if (!response.ok) {
858
+ throw new Error(`HTTP ${response.status} ${response.statusText}`);
859
+ }
860
+ } catch (error) {
861
+ const reason = error instanceof Error ? error.message : String(error);
862
+ throw new Error(`CLI healthcheck failed for '${this.targetName}': ${reason}`);
863
+ } finally {
864
+ if (timer !== void 0) {
865
+ clearTimeout(timer);
866
+ }
867
+ }
868
+ return;
869
+ }
870
+ const renderedCommand = renderTemplate(
871
+ healthcheck.commandTemplate,
872
+ buildTemplateValues(
873
+ {
874
+ prompt: "",
875
+ guidelines: "",
876
+ inputFiles: [],
877
+ evalCaseId: "",
878
+ attempt: 0
879
+ },
880
+ this.config
881
+ )
882
+ );
883
+ const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
884
+ const result = await this.runCommand(renderedCommand, {
885
+ cwd: healthcheck.cwd ?? this.config.cwd,
886
+ env,
887
+ timeoutMs,
888
+ signal
889
+ });
890
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
891
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
892
+ const detail = result.stderr.trim() || result.stdout.trim();
893
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI healthcheck command exited with code ${codeText}`;
894
+ throw new Error(`CLI healthcheck failed for '${this.targetName}': ${message}`);
895
+ }
896
+ }
897
+ };
898
+ function buildTemplateValues(request, config) {
899
+ const inputFiles = normalizeInputFiles(request.inputFiles);
900
+ return {
901
+ PROMPT: shellEscape(request.prompt ?? ""),
902
+ GUIDELINES: shellEscape(request.guidelines ?? ""),
903
+ EVAL_ID: shellEscape(request.evalCaseId ?? ""),
904
+ ATTEMPT: shellEscape(String(request.attempt ?? 0)),
905
+ FILES: formatFileList(inputFiles, config.filesFormat)
906
+ };
907
+ }
908
+ function normalizeInputFiles(inputFiles) {
909
+ if (!inputFiles || inputFiles.length === 0) {
910
+ return void 0;
911
+ }
912
+ const unique = /* @__PURE__ */ new Map();
913
+ for (const inputFile of inputFiles) {
914
+ const absolutePath = path2.resolve(inputFile);
915
+ if (!unique.has(absolutePath)) {
916
+ unique.set(absolutePath, absolutePath);
917
+ }
918
+ }
919
+ return Array.from(unique.values());
920
+ }
921
+ function formatFileList(files, template) {
922
+ if (!files || files.length === 0) {
923
+ return "";
924
+ }
925
+ const formatter = template ?? "{path}";
926
+ return files.map((filePath) => {
927
+ const escapedPath = shellEscape(filePath);
928
+ const escapedName = shellEscape(path2.basename(filePath));
929
+ return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
930
+ }).join(" ");
931
+ }
932
+ function renderTemplate(template, values) {
933
+ return template.replace(/\{([A-Z_]+)\}/g, (match, key) => {
934
+ const replacement = values[key];
935
+ return replacement !== void 0 ? replacement : match;
936
+ });
937
+ }
938
+ function shellEscape(value) {
939
+ if (value.length === 0) {
940
+ return "''";
941
+ }
942
+ if (process.platform === "win32") {
943
+ const escaped = value.replace(/"/g, '\\"');
944
+ return `"${escaped}"`;
945
+ }
946
+ return `'${value.replace(/'/g, `'"'"'`)}'`;
947
+ }
948
+ function formatTimeoutSuffix(timeoutMs) {
949
+ if (!timeoutMs || timeoutMs <= 0) {
950
+ return "";
951
+ }
952
+ const seconds = Math.ceil(timeoutMs / 1e3);
953
+ return ` after ${seconds}s`;
954
+ }
955
+
673
956
  // src/evaluation/providers/mock.ts
674
957
  var DEFAULT_MOCK_RESPONSE = '{"answer":"Mock provider response. Configure targets.yaml to supply a custom value."}';
675
958
  var MockProvider = class {
@@ -713,6 +996,7 @@ var MockProvider = class {
713
996
 
714
997
  // src/evaluation/providers/targets.ts
715
998
  import { z } from "zod";
999
+ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES"]);
716
1000
  var BASE_TARGET_SCHEMA = z.object({
717
1001
  name: z.string().min(1, "target name is required"),
718
1002
  provider: z.string().min(1, "provider is required"),
@@ -735,6 +1019,9 @@ function normalizeAzureApiVersion(value) {
735
1019
  function resolveTargetDefinition(definition, env = process.env) {
736
1020
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
737
1021
  const provider = parsed.provider.toLowerCase();
1022
+ const providerBatching = resolveOptionalBoolean(
1023
+ parsed.settings?.provider_batching ?? parsed.settings?.providerBatching
1024
+ );
738
1025
  switch (provider) {
739
1026
  case "azure":
740
1027
  case "azure-openai":
@@ -743,6 +1030,7 @@ function resolveTargetDefinition(definition, env = process.env) {
743
1030
  name: parsed.name,
744
1031
  judgeTarget: parsed.judge_target,
745
1032
  workers: parsed.workers,
1033
+ providerBatching,
746
1034
  config: resolveAzureConfig(parsed, env)
747
1035
  };
748
1036
  case "anthropic":
@@ -751,6 +1039,7 @@ function resolveTargetDefinition(definition, env = process.env) {
751
1039
  name: parsed.name,
752
1040
  judgeTarget: parsed.judge_target,
753
1041
  workers: parsed.workers,
1042
+ providerBatching,
754
1043
  config: resolveAnthropicConfig(parsed, env)
755
1044
  };
756
1045
  case "gemini":
@@ -761,14 +1050,26 @@ function resolveTargetDefinition(definition, env = process.env) {
761
1050
  name: parsed.name,
762
1051
  judgeTarget: parsed.judge_target,
763
1052
  workers: parsed.workers,
1053
+ providerBatching,
764
1054
  config: resolveGeminiConfig(parsed, env)
765
1055
  };
1056
+ case "codex":
1057
+ case "codex-cli":
1058
+ return {
1059
+ kind: "codex",
1060
+ name: parsed.name,
1061
+ judgeTarget: parsed.judge_target,
1062
+ workers: parsed.workers,
1063
+ providerBatching,
1064
+ config: resolveCodexConfig(parsed, env)
1065
+ };
766
1066
  case "mock":
767
1067
  return {
768
1068
  kind: "mock",
769
1069
  name: parsed.name,
770
1070
  judgeTarget: parsed.judge_target,
771
1071
  workers: parsed.workers,
1072
+ providerBatching,
772
1073
  config: resolveMockConfig(parsed)
773
1074
  };
774
1075
  case "vscode":
@@ -778,8 +1079,18 @@ function resolveTargetDefinition(definition, env = process.env) {
778
1079
  name: parsed.name,
779
1080
  judgeTarget: parsed.judge_target,
780
1081
  workers: parsed.workers,
1082
+ providerBatching,
781
1083
  config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
782
1084
  };
1085
+ case "cli":
1086
+ return {
1087
+ kind: "cli",
1088
+ name: parsed.name,
1089
+ judgeTarget: parsed.judge_target,
1090
+ workers: parsed.workers,
1091
+ providerBatching,
1092
+ config: resolveCliConfig(parsed, env)
1093
+ };
783
1094
  default:
784
1095
  throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
785
1096
  }
@@ -847,6 +1158,29 @@ function resolveGeminiConfig(target, env) {
847
1158
  maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`)
848
1159
  };
849
1160
  }
1161
+ function resolveCodexConfig(target, env) {
1162
+ const settings = target.settings ?? {};
1163
+ const executableSource = settings.executable ?? settings.command ?? settings.binary;
1164
+ const argsSource = settings.args ?? settings.arguments;
1165
+ const cwdSource = settings.cwd;
1166
+ const timeoutSource = settings.timeout_seconds ?? settings.timeoutSeconds;
1167
+ const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
1168
+ allowLiteral: true,
1169
+ optionalEnv: true
1170
+ }) ?? "codex";
1171
+ const args = resolveOptionalStringArray(argsSource, env, `${target.name} codex args`);
1172
+ const cwd = resolveOptionalString(cwdSource, env, `${target.name} codex cwd`, {
1173
+ allowLiteral: true,
1174
+ optionalEnv: true
1175
+ });
1176
+ const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} codex timeout`);
1177
+ return {
1178
+ executable,
1179
+ args,
1180
+ cwd,
1181
+ timeoutMs
1182
+ };
1183
+ }
850
1184
  function resolveMockConfig(target) {
851
1185
  const settings = target.settings ?? {};
852
1186
  const response = typeof settings.response === "string" ? settings.response : void 0;
@@ -876,6 +1210,125 @@ function resolveVSCodeConfig(target, env, insiders) {
876
1210
  workspaceTemplate
877
1211
  };
878
1212
  }
1213
+ function resolveCliConfig(target, env) {
1214
+ const settings = target.settings ?? {};
1215
+ const commandTemplateSource = settings.command_template ?? settings.commandTemplate;
1216
+ const filesFormat = resolveOptionalLiteralString(
1217
+ settings.files_format ?? settings.filesFormat ?? settings.attachments_format ?? settings.attachmentsFormat
1218
+ );
1219
+ const cwd = resolveOptionalString(settings.cwd, env, `${target.name} working directory`, {
1220
+ allowLiteral: true,
1221
+ optionalEnv: true
1222
+ });
1223
+ const envOverrides = resolveEnvOverrides(settings.env, env, target.name);
1224
+ const timeoutMs = resolveTimeoutMs(settings.timeout_seconds ?? settings.timeoutSeconds, `${target.name} timeout`);
1225
+ const healthcheck = resolveCliHealthcheck(settings.healthcheck, env, target.name);
1226
+ const commandTemplate = resolveString(
1227
+ commandTemplateSource,
1228
+ env,
1229
+ `${target.name} CLI command template`,
1230
+ true
1231
+ );
1232
+ assertSupportedCliPlaceholders(commandTemplate, `${target.name} CLI command template`);
1233
+ return {
1234
+ commandTemplate,
1235
+ filesFormat,
1236
+ cwd,
1237
+ env: envOverrides,
1238
+ timeoutMs,
1239
+ healthcheck
1240
+ };
1241
+ }
1242
+ function resolveEnvOverrides(source, env, targetName) {
1243
+ if (source === void 0 || source === null) {
1244
+ return void 0;
1245
+ }
1246
+ if (typeof source !== "object" || Array.isArray(source)) {
1247
+ throw new Error(`${targetName} env overrides must be an object map of strings`);
1248
+ }
1249
+ const entries = Object.entries(source);
1250
+ const resolved = {};
1251
+ for (const [key, value] of entries) {
1252
+ if (typeof value !== "string") {
1253
+ throw new Error(`${targetName} env override '${key}' must be a string`);
1254
+ }
1255
+ const resolvedValue = resolveString(value, env, `${targetName} env override '${key}'`);
1256
+ resolved[key] = resolvedValue;
1257
+ }
1258
+ return Object.keys(resolved).length > 0 ? resolved : void 0;
1259
+ }
1260
+ function resolveTimeoutMs(source, description) {
1261
+ const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
1262
+ if (seconds === void 0) {
1263
+ return void 0;
1264
+ }
1265
+ if (seconds <= 0) {
1266
+ throw new Error(`${description} must be greater than zero seconds`);
1267
+ }
1268
+ return Math.floor(seconds * 1e3);
1269
+ }
1270
+ function resolveCliHealthcheck(source, env, targetName) {
1271
+ if (source === void 0 || source === null) {
1272
+ return void 0;
1273
+ }
1274
+ if (typeof source !== "object" || Array.isArray(source)) {
1275
+ throw new Error(`${targetName} healthcheck must be an object`);
1276
+ }
1277
+ const candidate = source;
1278
+ const type = candidate.type;
1279
+ const timeoutMs = resolveTimeoutMs(
1280
+ candidate.timeout_seconds ?? candidate.timeoutSeconds,
1281
+ `${targetName} healthcheck timeout`
1282
+ );
1283
+ if (type === "http") {
1284
+ const url = resolveString(candidate.url, env, `${targetName} healthcheck URL`);
1285
+ return {
1286
+ type: "http",
1287
+ url,
1288
+ timeoutMs
1289
+ };
1290
+ }
1291
+ if (type === "command") {
1292
+ const commandTemplate = resolveString(
1293
+ candidate.command_template ?? candidate.commandTemplate,
1294
+ env,
1295
+ `${targetName} healthcheck command template`,
1296
+ true
1297
+ );
1298
+ assertSupportedCliPlaceholders(commandTemplate, `${targetName} healthcheck command template`);
1299
+ const cwd = resolveOptionalString(candidate.cwd, env, `${targetName} healthcheck cwd`, {
1300
+ allowLiteral: true,
1301
+ optionalEnv: true
1302
+ });
1303
+ return {
1304
+ type: "command",
1305
+ commandTemplate,
1306
+ timeoutMs,
1307
+ cwd
1308
+ };
1309
+ }
1310
+ throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
1311
+ }
1312
+ function assertSupportedCliPlaceholders(template, description) {
1313
+ const placeholders = extractCliPlaceholders(template);
1314
+ for (const placeholder of placeholders) {
1315
+ if (!CLI_PLACEHOLDERS.has(placeholder)) {
1316
+ throw new Error(
1317
+ `${description} includes unsupported placeholder '{${placeholder}}'. Supported placeholders: ${Array.from(CLI_PLACEHOLDERS).join(", ")}`
1318
+ );
1319
+ }
1320
+ }
1321
+ }
1322
+ function extractCliPlaceholders(template) {
1323
+ const matches = template.matchAll(/\{([A-Z_]+)\}/g);
1324
+ const results = [];
1325
+ for (const match of matches) {
1326
+ if (match[1]) {
1327
+ results.push(match[1]);
1328
+ }
1329
+ }
1330
+ return results;
1331
+ }
879
1332
  function resolveString(source, env, description, allowLiteral = false) {
880
1333
  const value = resolveOptionalString(source, env, description, {
881
1334
  allowLiteral,
@@ -906,11 +1359,14 @@ function resolveOptionalString(source, env, description, options) {
906
1359
  }
907
1360
  const allowLiteral = options?.allowLiteral ?? false;
908
1361
  const optionalEnv = options?.optionalEnv ?? false;
909
- if (!allowLiteral && isLikelyEnvReference(trimmed)) {
1362
+ const looksLikeEnv = isLikelyEnvReference(trimmed);
1363
+ if (looksLikeEnv) {
910
1364
  if (optionalEnv) {
911
1365
  return void 0;
912
1366
  }
913
- throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
1367
+ if (!allowLiteral) {
1368
+ throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
1369
+ }
914
1370
  }
915
1371
  return trimmed;
916
1372
  }
@@ -960,15 +1416,48 @@ function resolveOptionalBoolean(source) {
960
1416
  function isLikelyEnvReference(value) {
961
1417
  return /^[A-Z0-9_]+$/.test(value);
962
1418
  }
1419
+ function resolveOptionalStringArray(source, env, description) {
1420
+ if (source === void 0 || source === null) {
1421
+ return void 0;
1422
+ }
1423
+ if (!Array.isArray(source)) {
1424
+ throw new Error(`${description} must be an array of strings`);
1425
+ }
1426
+ if (source.length === 0) {
1427
+ return void 0;
1428
+ }
1429
+ const resolved = [];
1430
+ for (let i = 0; i < source.length; i++) {
1431
+ const item = source[i];
1432
+ if (typeof item !== "string") {
1433
+ throw new Error(`${description}[${i}] must be a string`);
1434
+ }
1435
+ const trimmed = item.trim();
1436
+ if (trimmed.length === 0) {
1437
+ throw new Error(`${description}[${i}] cannot be empty`);
1438
+ }
1439
+ const envValue = env[trimmed];
1440
+ if (envValue !== void 0) {
1441
+ if (envValue.trim().length === 0) {
1442
+ throw new Error(`Environment variable '${trimmed}' for ${description}[${i}] is empty`);
1443
+ }
1444
+ resolved.push(envValue);
1445
+ } else {
1446
+ resolved.push(trimmed);
1447
+ }
1448
+ }
1449
+ return resolved.length > 0 ? resolved : void 0;
1450
+ }
963
1451
 
964
1452
  // src/evaluation/providers/vscode.ts
965
1453
  import { readFile as readFile2 } from "node:fs/promises";
966
- import path2 from "node:path";
967
- import { dispatchAgentSession, getSubagentRoot, provisionSubagents } from "subagent";
1454
+ import path3 from "node:path";
1455
+ import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
968
1456
  var VSCodeProvider = class {
969
1457
  id;
970
1458
  kind;
971
1459
  targetName;
1460
+ supportsBatch = true;
972
1461
  config;
973
1462
  constructor(targetName, config, kind) {
974
1463
  this.id = `${kind}:${targetName}`;
@@ -980,12 +1469,11 @@ var VSCodeProvider = class {
980
1469
  if (request.signal?.aborted) {
981
1470
  throw new Error("VS Code provider request was aborted before dispatch");
982
1471
  }
983
- const attachments = normalizeAttachments(request.attachments);
984
- const promptContent = buildPromptDocument(request, attachments, request.guideline_patterns);
1472
+ const inputFiles = normalizeAttachments(request.inputFiles);
1473
+ const promptContent = buildPromptDocument(request, inputFiles, request.guideline_patterns);
985
1474
  const session = await dispatchAgentSession({
986
1475
  userQuery: promptContent,
987
- // Use full prompt content instead of just request.prompt
988
- extraAttachments: attachments,
1476
+ extraAttachments: inputFiles,
989
1477
  wait: this.config.waitForResponse,
990
1478
  dryRun: this.config.dryRun,
991
1479
  vscodeCmd: this.config.command,
@@ -1002,7 +1490,7 @@ var VSCodeProvider = class {
1002
1490
  text: "",
1003
1491
  raw: {
1004
1492
  session,
1005
- attachments
1493
+ inputFiles
1006
1494
  }
1007
1495
  };
1008
1496
  }
@@ -1011,42 +1499,106 @@ var VSCodeProvider = class {
1011
1499
  text: responseText,
1012
1500
  raw: {
1013
1501
  session,
1014
- attachments
1502
+ inputFiles
1015
1503
  }
1016
1504
  };
1017
1505
  }
1506
+ async invokeBatch(requests) {
1507
+ if (requests.length === 0) {
1508
+ return [];
1509
+ }
1510
+ const normalizedRequests = requests.map((req) => ({
1511
+ request: req,
1512
+ inputFiles: normalizeAttachments(req.inputFiles)
1513
+ }));
1514
+ const combinedInputFiles = mergeAttachments(
1515
+ normalizedRequests.map(({ inputFiles }) => inputFiles)
1516
+ );
1517
+ const userQueries = normalizedRequests.map(
1518
+ ({ request, inputFiles }) => buildPromptDocument(request, inputFiles, request.guideline_patterns)
1519
+ );
1520
+ const session = await dispatchBatchAgent({
1521
+ userQueries,
1522
+ extraAttachments: combinedInputFiles,
1523
+ wait: this.config.waitForResponse,
1524
+ dryRun: this.config.dryRun,
1525
+ vscodeCmd: this.config.command,
1526
+ subagentRoot: this.config.subagentRoot,
1527
+ workspaceTemplate: this.config.workspaceTemplate,
1528
+ silent: true
1529
+ });
1530
+ if (session.exitCode !== 0 || !session.responseFiles) {
1531
+ const failure = session.error ?? "VS Code subagent did not produce batch responses";
1532
+ throw new Error(failure);
1533
+ }
1534
+ if (this.config.dryRun) {
1535
+ return normalizedRequests.map(({ inputFiles }) => ({
1536
+ text: "",
1537
+ raw: {
1538
+ session,
1539
+ inputFiles,
1540
+ allInputFiles: combinedInputFiles
1541
+ }
1542
+ }));
1543
+ }
1544
+ if (session.responseFiles.length !== requests.length) {
1545
+ throw new Error(
1546
+ `VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
1547
+ );
1548
+ }
1549
+ const responses = [];
1550
+ for (const [index, responseFile] of session.responseFiles.entries()) {
1551
+ const responseText = await readFile2(responseFile, "utf8");
1552
+ responses.push({
1553
+ text: responseText,
1554
+ raw: {
1555
+ session,
1556
+ inputFiles: normalizedRequests[index]?.inputFiles,
1557
+ allInputFiles: combinedInputFiles,
1558
+ responseFile
1559
+ }
1560
+ });
1561
+ }
1562
+ return responses;
1563
+ }
1018
1564
  };
1019
1565
  function buildPromptDocument(request, attachments, guidelinePatterns) {
1020
1566
  const parts = [];
1021
1567
  const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
1022
- if (guidelineFiles.length > 0) {
1023
- parts.push("\n", buildMandatoryPrereadBlock(guidelineFiles));
1568
+ const attachmentFiles = collectAttachmentFiles(attachments);
1569
+ const nonGuidelineAttachments = attachmentFiles.filter(
1570
+ (file) => !guidelineFiles.includes(file)
1571
+ );
1572
+ const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineAttachments);
1573
+ if (prereadBlock.length > 0) {
1574
+ parts.push("\n", prereadBlock);
1024
1575
  }
1025
1576
  parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
1026
1577
  return parts.join("\n").trim();
1027
1578
  }
1028
- function buildMandatoryPrereadBlock(guidelineFiles) {
1029
- if (guidelineFiles.length === 0) {
1579
+ function buildMandatoryPrereadBlock(guidelineFiles, attachmentFiles) {
1580
+ if (guidelineFiles.length === 0 && attachmentFiles.length === 0) {
1030
1581
  return "";
1031
1582
  }
1032
- const fileList = [];
1033
- let counter = 0;
1034
- for (const absolutePath of guidelineFiles) {
1035
- counter += 1;
1036
- const fileName = path2.basename(absolutePath);
1583
+ const buildList = (files) => files.map((absolutePath) => {
1584
+ const fileName = path3.basename(absolutePath);
1037
1585
  const fileUri = pathToFileUri(absolutePath);
1038
- fileList.push(`* [${fileName}](${fileUri})`);
1039
- }
1040
- const filesText = fileList.join("\n");
1041
- const instruction = [
1042
- `Read all guideline files:
1043
- ${filesText}.
1044
- `,
1045
- `If any file is missing, fail with ERROR: missing-file <filename> and stop.
1046
- `,
1047
- `Then apply system_instructions on the user query below.`
1048
- ].join("");
1049
- return `${instruction}`;
1586
+ return `* [${fileName}](${fileUri})`;
1587
+ });
1588
+ const sections = [];
1589
+ if (guidelineFiles.length > 0) {
1590
+ sections.push(`Read all guideline files:
1591
+ ${buildList(guidelineFiles).join("\n")}.`);
1592
+ }
1593
+ if (attachmentFiles.length > 0) {
1594
+ sections.push(`Read all attachment files:
1595
+ ${buildList(attachmentFiles).join("\n")}.`);
1596
+ }
1597
+ sections.push(
1598
+ "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
1599
+ "Then apply system_instructions on the user query below."
1600
+ );
1601
+ return sections.join("\n");
1050
1602
  }
1051
1603
  function collectGuidelineFiles(attachments, guidelinePatterns) {
1052
1604
  if (!attachments || attachments.length === 0) {
@@ -1054,8 +1606,8 @@ function collectGuidelineFiles(attachments, guidelinePatterns) {
1054
1606
  }
1055
1607
  const unique = /* @__PURE__ */ new Map();
1056
1608
  for (const attachment of attachments) {
1057
- const absolutePath = path2.resolve(attachment);
1058
- const normalized = absolutePath.split(path2.sep).join("/");
1609
+ const absolutePath = path3.resolve(attachment);
1610
+ const normalized = absolutePath.split(path3.sep).join("/");
1059
1611
  if (isGuidelineFile(normalized, guidelinePatterns)) {
1060
1612
  if (!unique.has(absolutePath)) {
1061
1613
  unique.set(absolutePath, absolutePath);
@@ -1064,8 +1616,21 @@ function collectGuidelineFiles(attachments, guidelinePatterns) {
1064
1616
  }
1065
1617
  return Array.from(unique.values());
1066
1618
  }
1619
/**
 * Resolve every attachment to an absolute path and remove duplicates.
 *
 * @param {string[] | undefined | null} attachments - Attachment paths, absolute or relative.
 * @returns {string[]} Unique absolute paths in first-seen order; an empty
 *   array when `attachments` is missing or empty.
 */
function collectAttachmentFiles(attachments) {
  const hasEntries = Boolean(attachments) && attachments.length !== 0;
  if (!hasEntries) {
    return [];
  }
  // A Set keeps insertion order, so the first occurrence of each path wins.
  const seen = /* @__PURE__ */ new Set();
  for (const entry of attachments) {
    seen.add(path3.resolve(entry));
  }
  return [...seen];
}
1067
1632
  function pathToFileUri(filePath) {
1068
- const absolutePath = path2.isAbsolute(filePath) ? filePath : path2.resolve(filePath);
1633
+ const absolutePath = path3.isAbsolute(filePath) ? filePath : path3.resolve(filePath);
1069
1634
  const normalizedPath = absolutePath.replace(/\\/g, "/");
1070
1635
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
1071
1636
  return `file:///${normalizedPath}`;
@@ -1078,10 +1643,20 @@ function normalizeAttachments(attachments) {
1078
1643
  }
1079
1644
  const deduped = /* @__PURE__ */ new Set();
1080
1645
  for (const attachment of attachments) {
1081
- deduped.add(path2.resolve(attachment));
1646
+ deduped.add(path3.resolve(attachment));
1082
1647
  }
1083
1648
  return Array.from(deduped);
1084
1649
  }
1650
/**
 * Flatten several attachment lists into one list of unique absolute paths.
 *
 * @param {Iterable<Iterable<string> | undefined | null>} all - Attachment
 *   lists; falsy entries are ignored.
 * @returns {string[] | undefined} Unique resolved paths in first-seen order,
 *   or `undefined` when no path was collected at all.
 */
function mergeAttachments(all) {
  const resolved = /* @__PURE__ */ new Set();
  for (const group of all) {
    if (group) {
      for (const filePath of group) {
        resolved.add(path3.resolve(filePath));
      }
    }
  }
  if (resolved.size === 0) {
    return void 0;
  }
  return [...resolved];
}
1085
1660
  async function ensureVSCodeSubagents(options) {
1086
1661
  const { kind, count, verbose = false } = options;
1087
1662
  const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
@@ -1095,36 +1670,612 @@ async function ensureVSCodeSubagents(options) {
1095
1670
  subagents: count,
1096
1671
  dryRun: false
1097
1672
  });
1098
- if (verbose) {
1099
- if (result.created.length > 0) {
1100
- console.log(`Created ${result.created.length} new subagent(s)`);
1673
+ if (verbose) {
1674
+ if (result.created.length > 0) {
1675
+ console.log(`Created ${result.created.length} new subagent(s)`);
1676
+ }
1677
+ if (result.skippedExisting.length > 0) {
1678
+ console.log(`Reusing ${result.skippedExisting.length} existing unlocked subagent(s)`);
1679
+ }
1680
+ console.log(`
1681
+ total unlocked subagents available: ${result.created.length + result.skippedExisting.length}`);
1682
+ }
1683
+ return {
1684
+ provisioned: true,
1685
+ message: `Provisioned ${count} subagent(s): ${result.created.length} created, ${result.skippedExisting.length} reused`
1686
+ };
1687
+ } catch (error) {
1688
+ const errorMessage = error instanceof Error ? error.message : String(error);
1689
+ if (verbose) {
1690
+ console.warn(`Provisioning failed (continuing anyway): ${errorMessage}`);
1691
+ }
1692
+ return {
1693
+ provisioned: false,
1694
+ message: `Provisioning failed: ${errorMessage}`
1695
+ };
1696
+ }
1697
+ }
1698
+
1699
+ // src/evaluation/providers/codex.ts
1700
+ import { exec as execCallback, spawn } from "node:child_process";
1701
+ import { constants as constants2 } from "node:fs";
1702
+ import { access as access2, copyFile, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
1703
+ import { tmpdir } from "node:os";
1704
+ import path5 from "node:path";
1705
+ import { promisify as promisify2 } from "node:util";
1706
+
1707
+ // src/evaluation/providers/preread.ts
1708
+ import path4 from "node:path";
1709
/**
 * Assemble the prompt text sent to the agent: an optional pre-read block
 * listing guideline and input files, followed by the user query section.
 *
 * @param {object} request - Provider request; `request.prompt` is the user
 *   query and `request.guideline_patterns` is the fallback pattern list.
 * @param {string[] | undefined} inputFiles - Files the agent should read.
 * @param {{ guidelinePatterns?: unknown, guidelineOverrides?: Set<string> }} [options]
 *   - Optional pattern override and a set of absolute paths that are always
 *   treated as guidelines.
 * @returns {string} The trimmed prompt document.
 */
function buildPromptDocument2(request, inputFiles, options) {
  const patterns = options?.guidelinePatterns ?? request.guideline_patterns;
  const guidelineFiles = collectGuidelineFiles2(inputFiles, patterns, options?.guidelineOverrides);
  // Guideline files get their own section, so keep them out of the input list.
  const guidelineLookup = new Set(guidelineFiles);
  const otherInputFiles = collectInputFiles(inputFiles).filter(
    (file) => !guidelineLookup.has(file)
  );
  const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, otherInputFiles);
  const segments = [
    ...(prereadBlock.length > 0 ? ["\n", prereadBlock] : []),
    "\n[[ ## user_query ## ]]\n",
    request.prompt.trim()
  ];
  return segments.join("\n").trim();
}
1727
/**
 * Resolve input file paths to absolute form and drop duplicates.
 *
 * @param {string[] | undefined | null} inputFiles - Paths to normalize.
 * @returns {string[] | undefined} Unique absolute paths in first-seen order,
 *   or `undefined` when `inputFiles` is missing or empty.
 */
function normalizeInputFiles2(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return void 0;
  }
  const absolutePaths = [...inputFiles].map((inputFile) => path4.resolve(inputFile));
  return [...new Set(absolutePaths)];
}
1740
/**
 * Pick the input files that count as guideline files.
 *
 * A file qualifies when its absolute path is listed in `overrides`, or when
 * `isGuidelineFile` accepts its forward-slash form for the given patterns.
 *
 * @param {string[] | undefined | null} inputFiles - Candidate file paths.
 * @param {unknown} guidelinePatterns - Patterns forwarded to `isGuidelineFile`.
 * @param {Set<string>} [overrides] - Absolute paths always treated as guidelines.
 * @returns {string[]} Unique absolute guideline paths in first-seen order.
 */
function collectGuidelineFiles2(inputFiles, guidelinePatterns, overrides) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  const guidelines = /* @__PURE__ */ new Set();
  for (const inputFile of inputFiles) {
    const absolutePath = path4.resolve(inputFile);
    // Short-circuit: an overridden path is never run through pattern matching.
    const qualifies = Boolean(overrides?.has(absolutePath)) || Boolean(isGuidelineFile(absolutePath.split(path4.sep).join("/"), guidelinePatterns));
    if (qualifies) {
      guidelines.add(absolutePath);
    }
  }
  return [...guidelines];
}
1762
/**
 * Resolve input files to absolute paths, keeping only the first occurrence
 * of each path.
 *
 * @param {string[] | undefined | null} inputFiles - Paths to collect.
 * @returns {string[]} Unique absolute paths in first-seen order; empty when
 *   `inputFiles` is missing or empty.
 */
function collectInputFiles(inputFiles) {
  if (!inputFiles || inputFiles.length === 0) {
    return [];
  }
  const seen = /* @__PURE__ */ new Set();
  const ordered = [];
  for (const inputFile of inputFiles) {
    const absolutePath = path4.resolve(inputFile);
    if (seen.has(absolutePath)) {
      continue;
    }
    seen.add(absolutePath);
    ordered.push(absolutePath);
  }
  return ordered;
}
1775
/**
 * Render the instruction block telling the agent which files to read before
 * answering.
 *
 * @param {string[]} guidelineFiles - Absolute paths of guideline files.
 * @param {string[]} inputFiles - Absolute paths of the remaining input files.
 * @returns {string} Newline-joined instructions, or `""` when both lists are
 *   empty.
 */
function buildMandatoryPrereadBlock2(guidelineFiles, inputFiles) {
  const nothingToRead = guidelineFiles.length === 0 && inputFiles.length === 0;
  if (nothingToRead) {
    return "";
  }
  // One markdown bullet per file: "* [basename](file-uri)".
  const renderList = (files) => files.map((absolutePath) => `* [${path4.basename(absolutePath)}](${pathToFileUri2(absolutePath)})`).join("\n");
  const sections = [
    ...(guidelineFiles.length > 0 ? [`Read all guideline files:\n${renderList(guidelineFiles)}.`] : []),
    ...(inputFiles.length > 0 ? [`Read all input files:\n${renderList(inputFiles)}.`] : []),
    "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
    "Then apply system_instructions on the user query below."
  ];
  return sections.join("\n");
}
1799
/**
 * Convert a file system path into a `file://` URI.
 *
 * Relative paths are resolved against the current working directory and
 * backslashes are turned into forward slashes. The path is percent-encoded so
 * that spaces, `%`, `?`, `#` and non-ASCII characters yield a valid URI;
 * unescaped, a space breaks the markdown link the URI is embedded in, and
 * `?`/`#` would be read as the start of a query or fragment. Paths made only
 * of URI-safe characters produce the same output as before.
 *
 * @param {string} filePath - Absolute or relative path.
 * @returns {string} `file:///C:/...` for drive-letter paths, otherwise
 *   `file://` followed by the (absolute) path.
 */
function pathToFileUri2(filePath) {
  const absolutePath = path4.isAbsolute(filePath) ? filePath : path4.resolve(filePath);
  const normalizedPath = absolutePath.replace(/\\/g, "/");
  let encodedPath;
  try {
    // encodeURI leaves "/" and the drive-letter ":" alone; "?" and "#" are
    // also left alone by it, so escape those two explicitly.
    encodedPath = encodeURI(normalizedPath).replace(/[?#]/g, (ch) => encodeURIComponent(ch));
  } catch {
    // encodeURI throws URIError on lone surrogates; fall back to the raw path
    // rather than failing prompt construction.
    encodedPath = normalizedPath;
  }
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
    return `file:///${encodedPath}`;
  }
  return `file://${encodedPath}`;
}
1807
+
1808
+ // src/evaluation/providers/codex.ts
1809
+ var execAsync2 = promisify2(execCallback); // promisified child_process.exec; awaited by locateExecutable for the where/which lookup
1810
+ var WORKSPACE_PREFIX = "agentv-codex-"; // name prefix handed to mkdtemp() for the per-invocation temp workspace
1811
+ var PROMPT_FILENAME = "prompt.md"; // file inside the workspace that invoke() writes the prompt to
1812
+ var FILES_DIR = "files"; // workspace sub-directory that mirrorInputFiles() copies input files into
1813
+ var JSONL_TYPE_ITEM_COMPLETED = "item.completed"; // event `type` whose `item` extractFromEvent() inspects for assistant text
1814
/**
 * Provider that answers a request by running the Codex CLI in a throw-away
 * workspace: input files are copied into the workspace, the prompt is written
 * to a file and handed to the runner, and the CLI's JSON output is parsed for
 * the assistant text.
 */
var CodexProvider = class {
  id;
  kind = "codex";
  targetName;
  supportsBatch = false;
  config;
  runCodex;
  environmentCheck;
  resolvedExecutable;

  /**
   * @param {string} targetName - Target name; also used to build `id`.
   * @param {object} config - Settings read by this class: `executable`,
   *   `args`, `cwd`, `timeoutMs`.
   * @param {Function} [runner] - Function that actually launches the CLI.
   */
  constructor(targetName, config, runner = defaultCodexRunner) {
    Object.assign(this, {
      id: `codex:${targetName}`,
      targetName,
      config,
      runCodex: runner
    });
  }

  /**
   * Run one request through the Codex CLI.
   *
   * @param {object} request - Reads `signal`, `inputFiles`,
   *   `guideline_patterns` and (via the prompt builder) `prompt`.
   * @returns {Promise<{ text: string, raw: object }>} Assistant text plus raw
   *   execution details.
   * @throws {Error} When already aborted, on timeout, on non-zero exit code,
   *   or when the output cannot be parsed.
   */
  async invoke(request) {
    if (request.signal?.aborted) {
      throw new Error("Codex provider request was aborted before execution");
    }
    await this.ensureEnvironmentReady();
    const inputFiles = normalizeInputFiles2(request.inputFiles);
    const guidelineSources = collectGuidelineFiles2(inputFiles, request.guideline_patterns);
    const originalGuidelines = new Set(guidelineSources.map((file) => path5.resolve(file)));
    const workspaceRoot = await this.createWorkspace();
    try {
      const mirror = await this.mirrorInputFiles(inputFiles, workspaceRoot, originalGuidelines);
      const mirroredInputFiles = mirror.mirroredInputFiles;
      // Mirrored copies live under new paths, so their guideline status is
      // passed along explicitly as overrides.
      const promptContent = buildPromptDocument2(request, mirroredInputFiles, {
        guidelinePatterns: request.guideline_patterns,
        guidelineOverrides: mirror.guidelineMirrors
      });
      const promptFile = path5.join(workspaceRoot, PROMPT_FILENAME);
      await writeFile(promptFile, promptContent, "utf8");
      const args = this.buildCodexArgs();
      const cwd = this.resolveCwd(workspaceRoot);
      const outcome = await this.executeCodex(args, cwd, promptContent, request.signal);
      if (outcome.timedOut) {
        throw new Error(
          `Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
        );
      }
      if (outcome.exitCode !== 0) {
        const detail = pickDetail(outcome.stderr, outcome.stdout);
        const prefix = `Codex CLI exited with code ${outcome.exitCode}`;
        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
      }
      const parsed = parseCodexJson(outcome.stdout);
      const text = extractAssistantText(parsed);
      const raw = {
        response: parsed,
        stdout: outcome.stdout,
        stderr: outcome.stderr,
        exitCode: outcome.exitCode,
        args,
        executable: this.resolvedExecutable ?? this.config.executable,
        promptFile,
        workspace: workspaceRoot,
        inputFiles: mirroredInputFiles
      };
      return { text, raw };
    } finally {
      await this.cleanupWorkspace(workspaceRoot);
    }
  }

  /** Run the environment validation once and reuse its promise afterwards. */
  async ensureEnvironmentReady() {
    this.environmentCheck ??= this.validateEnvironment();
    await this.environmentCheck;
  }

  /** Locate the configured executable and remember the resolved path. */
  async validateEnvironment() {
    this.resolvedExecutable = await locateExecutable(this.config.executable);
  }

  /**
   * @param {string} workspaceRoot - Temp workspace used when no cwd is configured.
   * @returns {string} The absolute configured cwd, or `workspaceRoot`.
   */
  resolveCwd(workspaceRoot) {
    return this.config.cwd ? path5.resolve(this.config.cwd) : workspaceRoot;
  }

  /**
   * Build the CLI argument list: fixed flags, then any configured extra
   * args, then a trailing "-".
   *
   * @returns {string[]} A fresh argument array.
   */
  buildCodexArgs() {
    const configured = this.config.args;
    const extras = configured && configured.length > 0 ? configured : [];
    return [
      "--ask-for-approval",
      "never",
      "exec",
      "--json",
      "--color",
      "never",
      "--skip-git-repo-check",
      ...extras,
      "-"
    ];
  }

  /**
   * Call the runner, translating a missing-executable failure (ENOENT) into a
   * descriptive error. Any other error is rethrown unchanged.
   */
  async executeCodex(args, cwd, promptContent, signal) {
    const runOptions = {
      executable: this.resolvedExecutable ?? this.config.executable,
      args,
      cwd,
      prompt: promptContent,
      timeoutMs: this.config.timeoutMs,
      env: process.env,
      signal
    };
    try {
      return await this.runCodex(runOptions);
    } catch (error) {
      if (error.code !== "ENOENT") {
        throw error;
      }
      throw new Error(
        `Codex executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
      );
    }
  }

  /**
   * Copy each input file into `<workspaceRoot>/files`, one at a time.
   *
   * When a basename repeats, later copies get a numeric suffix
   * (`name`, `name.1`, `name.2`, ...).
   *
   * @param {string[] | undefined} inputFiles - Source files to copy.
   * @param {string} workspaceRoot - Workspace directory.
   * @param {Set<string>} guidelineOriginals - Absolute source paths that are guidelines.
   * @returns {Promise<{ mirroredInputFiles: string[] | undefined, guidelineMirrors: Set<string> }>}
   *   Absolute destination paths, and the subset copied from guideline sources.
   */
  async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
    const guidelineMirrors = /* @__PURE__ */ new Set();
    if (!inputFiles || inputFiles.length === 0) {
      return { mirroredInputFiles: void 0, guidelineMirrors };
    }
    const filesRoot = path5.join(workspaceRoot, FILES_DIR);
    await mkdir(filesRoot, { recursive: true });
    const mirroredInputFiles = [];
    const usesPerName = /* @__PURE__ */ new Map();
    for (const inputFile of inputFiles) {
      const source = path5.resolve(inputFile);
      const baseName = path5.basename(source);
      const priorUses = usesPerName.get(baseName) ?? 0;
      usesPerName.set(baseName, priorUses + 1);
      const mirrorName = priorUses === 0 ? baseName : `${baseName}.${priorUses}`;
      const destination = path5.join(filesRoot, mirrorName);
      await copyFile(source, destination);
      const resolvedDestination = path5.resolve(destination);
      mirroredInputFiles.push(resolvedDestination);
      if (guidelineOriginals.has(source)) {
        guidelineMirrors.add(resolvedDestination);
      }
    }
    return { mirroredInputFiles, guidelineMirrors };
  }

  /** Create a fresh temp directory under the OS temp dir. */
  async createWorkspace() {
    const template = path5.join(tmpdir(), WORKSPACE_PREFIX);
    return await mkdtemp(template);
  }

  /** Remove the workspace recursively; removal failures are ignored. */
  async cleanupWorkspace(workspaceRoot) {
    try {
      await rm(workspaceRoot, { recursive: true, force: true });
    } catch {
      // Best effort: a leftover temp directory must not fail the request.
    }
  }
};
1969
/**
 * Find the file to execute for `candidate`.
 *
 * A candidate containing a path separator is treated as a path and must
 * exist; lookup errors propagate. Otherwise the platform locator (`where` on
 * Windows, `which` elsewhere) is consulted and any failure there is reported
 * as "not found on PATH".
 *
 * @param {string} candidate - Executable name or path.
 * @returns {Promise<string>} Path of the executable that was found.
 * @throws {Error} When a bare name cannot be located.
 */
async function locateExecutable(candidate) {
  // Apply the Windows extension probing, then verify the file is present.
  const confirmExists = async (filePath) => {
    const executablePath = await ensureWindowsExecutableVariant(filePath);
    await access2(executablePath, constants2.F_OK);
    return executablePath;
  };
  const looksLikePath = candidate.includes("/") || candidate.includes("\\");
  if (looksLikePath) {
    return await confirmExists(path5.isAbsolute(candidate) ? candidate : path5.resolve(candidate));
  }
  const locator = process.platform === "win32" ? "where" : "which";
  try {
    const { stdout } = await execAsync2(`${locator} ${candidate}`);
    const matches = stdout.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
    const preferred = selectExecutableCandidate(matches);
    if (preferred) {
      return await confirmExists(preferred);
    }
  } catch {
    // Any lookup failure falls through to the not-found error below.
  }
  throw new Error(`Codex executable '${candidate}' was not found on PATH`);
}
1991
/**
 * Choose which located path to run.
 *
 * Off Windows the first candidate wins. On Windows, extensions are tried in
 * the order returned by `getWindowsExecutableExtensions()` and the first
 * candidate ending with one of them wins; the first candidate is the fallback.
 *
 * @param {string[]} candidates - Paths reported by the locator.
 * @returns {string | undefined} The chosen path, or `undefined` for an empty list.
 */
function selectExecutableCandidate(candidates) {
  if (candidates.length === 0) {
    return void 0;
  }
  if (process.platform === "win32") {
    for (const ext of getWindowsExecutableExtensions()) {
      const hit = candidates.find((candidate) => candidate.toLowerCase().endsWith(ext));
      if (hit) {
        return hit;
      }
    }
  }
  return candidates[0];
}
2007
/**
 * On Windows, extend an extension-less path with the first known executable
 * extension for which a file exists. Off Windows, or when the path already
 * has such an extension, the input is returned unchanged.
 *
 * @param {string} candidate - Path to inspect.
 * @returns {Promise<string>} The path to use; `candidate` itself when no
 *   variant exists.
 */
async function ensureWindowsExecutableVariant(candidate) {
  const needsProbe = process.platform === "win32" && !hasExecutableExtension(candidate);
  if (!needsProbe) {
    return candidate;
  }
  for (const ext of getWindowsExecutableExtensions()) {
    const withExtension = `${candidate}${ext}`;
    let found = true;
    try {
      await access2(withExtension, constants2.F_OK);
    } catch {
      found = false;
    }
    if (found) {
      return withExtension;
    }
  }
  return candidate;
}
2025
/**
 * Check whether `candidate` ends with one of the extensions returned by
 * `getWindowsExecutableExtensions()`, comparing in lower case.
 *
 * @param {string} candidate - Path or file name.
 * @returns {boolean} `true` when a listed extension matches.
 */
function hasExecutableExtension(candidate) {
  const lowered = candidate.toLowerCase();
  for (const ext of getWindowsExecutableExtensions()) {
    if (lowered.endsWith(ext)) {
      return true;
    }
  }
  return false;
}
2029
// Extensions used on Windows when PATHEXT is unset or yields no entries.
var DEFAULT_WINDOWS_EXTENSIONS = [".com", ".exe", ".bat", ".cmd", ".ps1"];

/**
 * List the executable file extensions to consider.
 *
 * @returns {string[]} An empty array off Windows. On Windows, the trimmed,
 *   lower-cased, non-empty entries of `PATHEXT`, or the default list when
 *   there are none.
 */
function getWindowsExecutableExtensions() {
  if (process.platform !== "win32") {
    return [];
  }
  const declared = [];
  for (const raw of process.env.PATHEXT?.split(";") ?? []) {
    const ext = raw.trim().toLowerCase();
    if (ext.length > 0) {
      declared.push(ext);
    }
  }
  return declared.length > 0 ? declared : DEFAULT_WINDOWS_EXTENSIONS;
}
2037
/**
 * Parse the CLI's stdout as JSON.
 *
 * Attempts, in order: the whole trimmed text as one JSON value; every
 * non-empty line as its own JSON value (via `parseJsonLines`); the substring
 * starting at the last `{`.
 *
 * @param {string} output - Raw stdout.
 * @returns {unknown} The parsed value, or an array of per-line values.
 * @throws {Error} When the output is blank or none of the attempts parse.
 */
function parseCodexJson(output) {
  const trimmed = output.trim();
  if (trimmed.length === 0) {
    throw new Error("Codex CLI produced no output in --json mode");
  }
  let whole;
  let wholeParsed = true;
  try {
    whole = JSON.parse(trimmed);
  } catch {
    wholeParsed = false;
  }
  if (wholeParsed) {
    return whole;
  }
  const lineObjects = parseJsonLines(trimmed);
  if (lineObjects) {
    return lineObjects;
  }
  const braceAt = trimmed.lastIndexOf("{");
  if (braceAt >= 0) {
    try {
      return JSON.parse(trimmed.slice(braceAt));
    } catch {
      // Not valid JSON either; report the failure below.
    }
  }
  const ellipsis = trimmed.length > 200 ? "\u2026" : "";
  throw new Error(`Codex CLI emitted invalid JSON: ${trimmed.slice(0, 200)}${ellipsis}`);
}
2061
/**
 * Pull the assistant's reply text out of parsed CLI output.
 *
 * Sources are tried in this order: an event stream (when `parsed` is an
 * array), `parsed` treated as a single event, the last assistant entry in
 * `parsed.messages` that has content, `parsed.response.content`, and finally
 * `parsed.output`.
 *
 * @param {unknown} parsed - Value returned by the JSON parsing step.
 * @returns {string} The first non-empty text found.
 * @throws {Error} When no source yields text.
 */
function extractAssistantText(parsed) {
  const missingMessage = "Codex CLI JSON response did not include an assistant message";
  if (Array.isArray(parsed)) {
    const streamed = extractFromEventStream(parsed);
    if (streamed) {
      return streamed;
    }
  }
  if (!parsed || typeof parsed !== "object") {
    throw new Error(missingMessage);
  }
  const fromEvent = extractFromEvent(parsed);
  if (fromEvent) {
    return fromEvent;
  }
  if (Array.isArray(parsed.messages)) {
    // Newest first: the most recent assistant message with content wins.
    for (const entry of [...parsed.messages].reverse()) {
      const isAssistant = Boolean(entry) && typeof entry === "object" && entry.role === "assistant";
      if (!isAssistant) {
        continue;
      }
      const fromMessage = flattenContent(entry.content);
      if (fromMessage) {
        return fromMessage;
      }
    }
  }
  const response = parsed.response;
  if (response && typeof response === "object") {
    const fromResponse = flattenContent(response.content);
    if (fromResponse) {
      return fromResponse;
    }
  }
  const fromOutput = flattenContent(parsed.output);
  if (fromOutput) {
    return fromOutput;
  }
  throw new Error(missingMessage);
}
2109
/**
 * Scan an event list from last to first and return the first text found.
 *
 * @param {unknown[]} events - Parsed events.
 * @returns {string | undefined} Text of the latest event that has any.
 */
function extractFromEventStream(events) {
  let cursor = events.length;
  while (cursor > 0) {
    cursor -= 1;
    const text = extractFromEvent(events[cursor]);
    if (text) {
      return text;
    }
  }
  return void 0;
}
2119
/**
 * Extract text from a single event object.
 *
 * For an event whose `type` equals `JSONL_TYPE_ITEM_COMPLETED`, the nested
 * `item` is inspected first. Otherwise, or when the item has no text, the
 * event's `output` (or, if that is nullish, `content`) is flattened.
 *
 * @param {unknown} event - Candidate event.
 * @returns {string | undefined} Non-empty text, or `undefined`.
 */
function extractFromEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const isCompletedItem = typeof event.type === "string" && event.type === JSONL_TYPE_ITEM_COMPLETED;
  if (isCompletedItem) {
    const fromItem = extractFromItem(event.item);
    if (fromItem) {
      return fromItem;
    }
  }
  // An empty string counts as "no text", the same as undefined.
  return flattenContent(event.output ?? event.content) || void 0;
}
2139
/**
 * Extract text from a completed item, but only for items whose `type` is
 * "agent_message", "response" or "output".
 *
 * @param {unknown} item - Candidate item.
 * @returns {string | undefined} Flattened `text`, `content` or `output`
 *   (first non-nullish), or `undefined` when there is no text.
 */
function extractFromItem(item) {
  if (!item || typeof item !== "object") {
    return void 0;
  }
  const textBearingTypes = ["agent_message", "response", "output"];
  if (!textBearingTypes.includes(item.type)) {
    return void 0;
  }
  return flattenContent(item.text ?? item.content ?? item.output) || void 0;
}
2153
/**
 * Reduce a content value to plain text.
 *
 * - A string is returned as is (including the empty string).
 * - An array contributes its string elements and the string `text` of its
 *   object elements; non-empty pieces are joined with " \n".
 * - An object contributes its string `text` property.
 *
 * @param {unknown} value - Content in any of the shapes above.
 * @returns {string | undefined} The text, or `undefined` when none is present.
 */
function flattenContent(value) {
  if (typeof value === "string") {
    return value;
  }
  // Text carried by an object's `text` property, if it is a string.
  const textFieldOf = (node) => {
    if (node && typeof node === "object" && "text" in node) {
      return typeof node.text === "string" ? node.text : void 0;
    }
    return void 0;
  };
  if (!Array.isArray(value)) {
    return textFieldOf(value);
  }
  const pieces = [];
  for (const segment of value) {
    const piece = typeof segment === "string" ? segment : textFieldOf(segment);
    if (typeof piece === "string" && piece.length > 0) {
      pieces.push(piece);
    }
  }
  return pieces.length > 0 ? pieces.join(" \n") : void 0;
}
2176
/**
 * Parse newline-delimited JSON.
 *
 * @param {string} output - Text with one JSON value per line; blank lines
 *   are ignored.
 * @returns {unknown[] | undefined} The parsed values, or `undefined` when
 *   there are fewer than two non-blank lines or any line is not valid JSON.
 */
function parseJsonLines(output) {
  const lines = output.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
  if (lines.length <= 1) {
    return void 0;
  }
  try {
    return lines.map((line) => JSON.parse(line));
  } catch {
    // One bad line invalidates the whole stream.
    return void 0;
  }
}
2191
/**
 * Choose the text to attach to a CLI failure message, preferring stderr.
 *
 * @param {string} stderr - Captured standard error.
 * @param {string} stdout - Captured standard output.
 * @returns {string | undefined} Trimmed stderr if non-empty, else trimmed
 *   stdout if non-empty, else `undefined`.
 */
function pickDetail(stderr, stdout) {
  for (const stream of [stderr, stdout]) {
    const text = stream.trim();
    if (text.length > 0) {
      return text;
    }
  }
  return void 0;
}
2199
/**
 * Format the " after Ns" suffix for timeout error messages.
 *
 * @param {number | undefined} timeoutMs - Timeout in milliseconds.
 * @returns {string} `" after <seconds>s"` with seconds rounded up, or `""`
 *   when the timeout is falsy or not positive.
 */
function formatTimeoutSuffix2(timeoutMs) {
  const hasTimeout = Boolean(timeoutMs) && !(timeoutMs <= 0);
  return hasTimeout ? ` after ${Math.ceil(timeoutMs / 1e3)}s` : "";
}
2206
/**
 * Default runner: spawn the CLI, feed the prompt through stdin and collect
 * stdout/stderr until the process closes.
 *
 * @param {object} options
 * @param {string} options.executable - Program to spawn.
 * @param {string[]} options.args - Arguments for the program.
 * @param {string} options.cwd - Working directory of the child.
 * @param {object} options.env - Environment of the child.
 * @param {string} options.prompt - Text written to the child's stdin.
 * @param {number} [options.timeoutMs] - When positive, the child is sent
 *   SIGTERM after this many milliseconds and the result is flagged `timedOut`.
 * @param {AbortSignal} [options.signal] - Aborting sends SIGTERM to the child.
 * @returns {Promise<{ stdout: string, stderr: string, exitCode: number, timedOut: boolean }>}
 *   Resolves when the child closes; `exitCode` is -1 when the close event
 *   carries no numeric code.
 * @throws Rejects with the child's `error` event (e.g. spawn failure).
 */
async function defaultCodexRunner(options) {
  return await new Promise((resolve, reject) => {
    const child = spawn(options.executable, options.args, {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: shouldShellExecute(options.executable)
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    const onAbort = () => {
      child.kill("SIGTERM");
    };
    if (options.signal) {
      if (options.signal.aborted) {
        onAbort();
      } else {
        options.signal.addEventListener("abort", onAbort, { once: true });
      }
    }
    let timeoutHandle;
    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill("SIGTERM");
      }, options.timeoutMs);
      // Do not keep the event loop alive just for the timeout timer.
      timeoutHandle.unref?.();
    }
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    // Without a listener, a write failure on stdin (e.g. EPIPE because the
    // child exited before reading the prompt, or never started) is an
    // unhandled stream error and crashes the process. The outcome is still
    // reported through the child's "error"/"close" events below.
    child.stdin.on("error", () => {
    });
    child.stdin.end(options.prompt);
    const cleanup = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", onAbort);
      }
    };
    child.on("error", (error) => {
      cleanup();
      reject(error);
    });
    child.on("close", (code) => {
      cleanup();
      resolve({
        stdout,
        stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
2267
/**
 * Decide whether the executable has to be spawned through a shell.
 *
 * @param {string} executable - Program path or name.
 * @returns {boolean} `true` only on Windows for `.cmd`, `.bat` and `.ps1`
 *   files (case-insensitive); `false` otherwise.
 */
function shouldShellExecute(executable) {
  if (process.platform !== "win32") {
    return false;
  }
  const lowered = executable.toLowerCase();
  return [".cmd", ".bat", ".ps1"].some((suffix) => lowered.endsWith(suffix));
}
1123
2274
 
1124
2275
  // src/evaluation/providers/targets-file.ts
1125
- import { constants as constants2 } from "node:fs";
1126
- import { access as access2, readFile as readFile3 } from "node:fs/promises";
1127
- import path3 from "node:path";
2276
+ import { constants as constants3 } from "node:fs";
2277
+ import { access as access3, readFile as readFile3 } from "node:fs/promises";
2278
+ import path6 from "node:path";
1128
2279
  import { parse as parse2 } from "yaml";
1129
2280
  function isRecord(value) {
1130
2281
  return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -1180,14 +2331,14 @@ function assertTargetDefinition(value, index, filePath) {
1180
2331
  }
1181
2332
/**
 * Report whether a path can be accessed (existence check with `F_OK`).
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} `true` when the access check succeeds, `false`
 *   when it fails for any reason.
 */
async function fileExists3(filePath) {
  let exists = true;
  try {
    await access3(filePath, constants3.F_OK);
  } catch {
    exists = false;
  }
  return exists;
}
1189
2340
  async function readTargetDefinitions(filePath) {
1190
- const absolutePath = path3.resolve(filePath);
2341
+ const absolutePath = path6.resolve(filePath);
1191
2342
  if (!await fileExists3(absolutePath)) {
1192
2343
  throw new Error(`targets.yaml not found at ${absolutePath}`);
1193
2344
  }
@@ -1214,6 +2365,10 @@ function createProvider(target) {
1214
2365
  return new AnthropicProvider(target.name, target.config);
1215
2366
  case "gemini":
1216
2367
  return new GeminiProvider(target.name, target.config);
2368
+ case "cli":
2369
+ return new CliProvider(target.name, target.config);
2370
+ case "codex":
2371
+ return new CodexProvider(target.name, target.config);
1217
2372
  case "mock":
1218
2373
  return new MockProvider(target.name, target.config);
1219
2374
  case "vscode":
@@ -1230,230 +2385,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
1230
2385
  return createProvider(resolved);
1231
2386
  }
1232
2387
 
1233
- // src/evaluation/scoring.ts
1234
- var KEY_TERM_MATCH_THRESHOLD = 0.5;
1235
- var ACTION_WORDS = /* @__PURE__ */ new Set([
1236
- "use",
1237
- "avoid",
1238
- "prefer",
1239
- "replace",
1240
- "consider",
1241
- "ensure",
1242
- "remove",
1243
- "add"
1244
- ]);
1245
- var STOP_WORDS = /* @__PURE__ */ new Set([
1246
- "the",
1247
- "a",
1248
- "an",
1249
- "and",
1250
- "or",
1251
- "but",
1252
- "in",
1253
- "on",
1254
- "at",
1255
- "to",
1256
- "for",
1257
- "of",
1258
- "with",
1259
- "by",
1260
- "is",
1261
- "are",
1262
- "was",
1263
- "were",
1264
- "be",
1265
- "been",
1266
- "being",
1267
- "have",
1268
- "has",
1269
- "had",
1270
- "do",
1271
- "does",
1272
- "did",
1273
- "will",
1274
- "would",
1275
- "could",
1276
- "should"
1277
- ]);
1278
- var ERROR_PREFIXES = [
1279
- "error:",
1280
- "err:",
1281
- "vs code command failed",
1282
- "exception",
1283
- "traceback",
1284
- "no response file was generated",
1285
- "timed out",
1286
- "cli not found"
1287
- ];
1288
- function extractAspects(expectedResponse) {
1289
- const lines = expectedResponse.split(/\r?\n/).map((line) => line.trim());
1290
- const aspects = [];
1291
- for (const line of lines) {
1292
- if (line.length === 0) {
1293
- continue;
1294
- }
1295
- const bulletMatch = /^([-*•]|[0-9]+\.)\s*(.+)$/.exec(line);
1296
- if (bulletMatch) {
1297
- const normalized = normalizeAspect(bulletMatch[2]);
1298
- if (normalized.length > 0) {
1299
- aspects.push(normalized);
1300
- }
1301
- continue;
1302
- }
1303
- const lowered = line.toLowerCase();
1304
- if (Array.from(ACTION_WORDS).some((word) => lowered.startsWith(word))) {
1305
- const normalized = normalizeAspect(line);
1306
- if (normalized.length > 0) {
1307
- aspects.push(normalized);
1308
- }
1309
- }
1310
- }
1311
- return aspects;
1312
- }
1313
- function calculateHits(candidateResponse, expectedAspects) {
1314
- const { normalizedText, words } = normalizeCandidate(candidateResponse);
1315
- const hits = [];
1316
- for (const aspect of expectedAspects) {
1317
- if (matchesAspect(aspect, normalizedText, words)) {
1318
- hits.push(aspect);
1319
- }
1320
- }
1321
- return hits;
1322
- }
1323
- function calculateMisses(candidateResponse, expectedAspects, resolvedHits) {
1324
- const hits = new Set(resolvedHits ?? calculateHits(candidateResponse, expectedAspects));
1325
- return expectedAspects.filter((aspect) => !hits.has(aspect));
1326
- }
1327
- function scoreCandidateResponse(candidateResponse, expectedAspects) {
1328
- if (expectedAspects.length === 0) {
1329
- if (isErrorLike(candidateResponse)) {
1330
- return {
1331
- score: 0,
1332
- hits: [],
1333
- misses: ["Model produced an error instead of an answer."],
1334
- hitCount: 0,
1335
- totalAspects: 0,
1336
- rawAspects: []
1337
- };
1338
- }
1339
- return {
1340
- score: 1,
1341
- hits: [],
1342
- misses: [],
1343
- hitCount: 0,
1344
- totalAspects: 0,
1345
- rawAspects: []
1346
- };
1347
- }
1348
- const hits = calculateHits(candidateResponse, expectedAspects);
1349
- const misses = expectedAspects.filter((aspect) => !hits.includes(aspect));
1350
- const score = expectedAspects.length > 0 ? hits.length / expectedAspects.length : 0;
1351
- return {
1352
- score,
1353
- hits,
1354
- misses,
1355
- hitCount: hits.length,
1356
- totalAspects: expectedAspects.length,
1357
- rawAspects: expectedAspects
1358
- };
1359
- }
1360
- function isErrorLike(text) {
1361
- if (!text) {
1362
- return false;
1363
- }
1364
- const lowered = text.trim().toLowerCase();
1365
- return ERROR_PREFIXES.some((prefix) => lowered.startsWith(prefix));
1366
- }
1367
- function normalizeAspect(aspect) {
1368
- const sanitized = aspect.toLowerCase().replace(/[^\w\s]/g, " ").replace(/\s+/g, " ").trim();
1369
- return sanitized;
1370
- }
1371
- function normalizeCandidate(candidate) {
1372
- const lowered = candidate.toLowerCase();
1373
- const normalizedText = lowered.replace(/[^\w\s]/g, " ");
1374
- const words = new Set(normalizedText.split(/\s+/).filter((word) => word.length > 0));
1375
- return { normalizedText, words };
1376
- }
1377
- function matchesAspect(aspect, candidateNormalized, candidateWords) {
1378
- const keyTerms = extractKeyTerms(aspect);
1379
- if (keyTerms.length === 0) {
1380
- return false;
1381
- }
1382
- const matches = keyTerms.filter((term) => candidateWords.has(term)).length;
1383
- const ratio = matches / keyTerms.length;
1384
- if (ratio >= KEY_TERM_MATCH_THRESHOLD) {
1385
- return true;
1386
- }
1387
- const aspectWords = aspect.split(" ");
1388
- if (aspectWords.length >= 2) {
1389
- for (let index = 0; index < aspectWords.length - 1; index += 1) {
1390
- const phrase = `${aspectWords[index]} ${aspectWords[index + 1]}`;
1391
- if (candidateNormalized.includes(phrase)) {
1392
- return true;
1393
- }
1394
- }
1395
- }
1396
- return false;
1397
- }
1398
- function extractKeyTerms(aspect, maxTerms = 5) {
1399
- const terms = [];
1400
- const words = aspect.split(" ");
1401
- for (const word of words) {
1402
- if (word.length <= 2) {
1403
- continue;
1404
- }
1405
- if (STOP_WORDS.has(word)) {
1406
- continue;
1407
- }
1408
- terms.push(word);
1409
- if (terms.length >= maxTerms) {
1410
- break;
1411
- }
1412
- }
1413
- return terms;
1414
- }
1415
-
1416
- // src/evaluation/grading.ts
2388
+ // src/evaluation/evaluators.ts
1417
2389
  import { randomUUID } from "node:crypto";
1418
- var HeuristicGrader = class {
1419
- kind = "heuristic";
1420
- grade(context) {
1421
- const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
1422
- const result = scoreCandidateResponse(context.candidate, expectedAspects);
1423
- const misses = [...result.misses];
1424
- if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
1425
- const firstLine = context.candidate.split(/\r?\n/)[0]?.trim();
1426
- if (firstLine && !misses.includes(firstLine)) {
1427
- misses.unshift(firstLine);
1428
- }
1429
- }
1430
- return {
1431
- score: result.score,
1432
- hits: result.hits,
1433
- misses,
1434
- expectedAspectCount: result.totalAspects,
1435
- rawAspects: result.rawAspects
1436
- };
1437
- }
1438
- };
1439
- var QualityGrader = class {
2390
+ var LlmJudgeEvaluator = class {
1440
2391
  kind = "llm_judge";
1441
2392
  resolveJudgeProvider;
1442
2393
  maxOutputTokens;
1443
2394
  temperature;
2395
+ customPrompt;
1444
2396
  constructor(options) {
1445
2397
  this.resolveJudgeProvider = options.resolveJudgeProvider;
1446
2398
  this.maxOutputTokens = options.maxOutputTokens;
1447
2399
  this.temperature = options.temperature;
2400
+ this.customPrompt = options.customPrompt;
1448
2401
  }
1449
- async grade(context) {
2402
+ async evaluate(context) {
1450
2403
  const judgeProvider = await this.resolveJudgeProvider(context);
1451
2404
  if (!judgeProvider) {
1452
2405
  throw new Error("No judge provider available for LLM grading");
1453
2406
  }
1454
2407
  const prompt = buildQualityPrompt(context.evalCase, context.candidate);
2408
+ const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
1455
2409
  const metadata = {
1456
- systemPrompt: QUALITY_SYSTEM_PROMPT
2410
+ ...systemPrompt !== void 0 ? { systemPrompt } : {},
2411
+ ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
1457
2412
  };
1458
2413
  const response = await judgeProvider.invoke({
1459
2414
  prompt,
@@ -1468,12 +2423,13 @@ var QualityGrader = class {
1468
2423
  const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
1469
2424
  const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
1470
2425
  const reasoning = parsed.reasoning ?? response.reasoning;
1471
- const graderRawRequest = {
2426
+ const evaluatorRawRequest = {
1472
2427
  id: randomUUID(),
1473
2428
  provider: judgeProvider.id,
1474
2429
  prompt,
1475
- systemPrompt: QUALITY_SYSTEM_PROMPT,
1476
- target: context.target.name
2430
+ target: context.target.name,
2431
+ ...systemPrompt !== void 0 ? { systemPrompt } : {},
2432
+ ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
1477
2433
  };
1478
2434
  return {
1479
2435
  score,
@@ -1481,7 +2437,7 @@ var QualityGrader = class {
1481
2437
  misses,
1482
2438
  expectedAspectCount: hits.length + misses.length || 1,
1483
2439
  reasoning,
1484
- graderRawRequest
2440
+ evaluatorRawRequest
1485
2441
  };
1486
2442
  }
1487
2443
  };
@@ -1599,11 +2555,117 @@ function extractJsonBlob(text) {
1599
2555
  function isNonEmptyString(value) {
1600
2556
  return typeof value === "string" && value.trim().length > 0;
1601
2557
  }
2558
/**
 * Evaluator that delegates scoring to an external script.
 *
 * The script receives a JSON document on stdin (task, outcome, expected
 * answer, candidate output, system message, guideline paths, attachments and
 * user segments) and is expected to print a JSON object on stdout with the
 * optional fields `score` (number), `hits` (string[]), `misses` (string[]) and
 * `reasoning` (string).
 */
var CodeEvaluator = class {
  kind = "code";
  script;
  cwd;
  agentTimeoutMs;

  /**
   * @param {object} options
   * @param {string} options.script - Command line handed to `executeScript`.
   * @param {string} [options.cwd] - Working directory for the script.
   * @param {number} [options.agentTimeoutMs] - Timeout forwarded to `executeScript`.
   */
  constructor(options) {
    this.script = options.script;
    this.cwd = options.cwd;
    this.agentTimeoutMs = options.agentTimeoutMs;
  }

  /**
   * Runs the script against one candidate answer.
   *
   * Never rejects because of the script itself: any failure (spawn error,
   * timeout, unusable output) is reported as a zero score whose single miss
   * and `reasoning` carry the error message.
   *
   * @param {object} context - Evaluation context; reads `evalCase`,
   *   `candidate` and `promptInputs.systemMessage`.
   * @returns {Promise<object>} Score record with `score`, `hits`, `misses`,
   *   `expectedAspectCount`, `reasoning` and `evaluatorRawRequest`.
   */
  async evaluate(context) {
    const { evalCase } = context;
    const inputPayload = JSON.stringify(
      {
        task: evalCase.task,
        outcome: evalCase.outcome,
        expected: evalCase.expected_assistant_raw,
        output: context.candidate,
        system_message: context.promptInputs.systemMessage ?? "",
        guideline_paths: evalCase.guideline_paths,
        attachments: evalCase.file_paths,
        user_segments: evalCase.user_segments
      },
      null,
      2
    );
    // Shared by the success and failure results so both describe the same invocation.
    const evaluatorRawRequest = {
      script: this.script,
      ...(this.cwd ? { cwd: this.cwd } : {})
    };
    try {
      const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
      const parsed = parseJsonSafe(stdout);
      // Output that is not a JSON object used to degrade silently into a bare
      // zero score with no explanation; surface it as an evaluator failure.
      if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
        throw new Error("script did not print a JSON object on stdout");
      }
      const score = clampScore(typeof parsed.score === "number" ? parsed.score : 0);
      const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString) : [];
      const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString) : [];
      const reasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : void 0;
      return {
        score,
        hits,
        misses,
        // Never report zero aspects, even when the script listed none.
        expectedAspectCount: hits.length + misses.length || 1,
        reasoning,
        evaluatorRawRequest
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        hits: [],
        misses: [`Code evaluator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: {
          ...evaluatorRawRequest,
          error: message
        }
      };
    }
  }
};
2618
/**
 * Runs an evaluator command through the shell, writes `input` to its stdin and
 * resolves with its trimmed stdout.
 *
 * @param {string} scriptPath - Command line to execute. It is interpreted by
 *   the shell (`shell: true`), so it must come from trusted configuration.
 * @param {string} input - Text written to the child's stdin before it is closed.
 * @param {number} [agentTimeoutMs] - When truthy, the child is killed and the
 *   promise rejects after this many milliseconds.
 * @param {string} [cwd] - Working directory for the child process.
 * @returns {Promise<string>} Trimmed stdout of the command.
 * @throws {Error} On spawn failure, timeout, a failing exit that wrote to
 *   stderr, or a failing exit / signal termination that produced no stdout.
 */
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
  const { spawn: spawn2 } = await import("node:child_process");
  return await new Promise((resolve, reject) => {
    const child = spawn2(scriptPath, {
      shell: true,
      cwd
    });
    let stdout = "";
    let stderr = "";
    let settled = false;
    let timeout;

    // Settles the promise at most once and always releases the timer, so a
    // late `close` after a timeout or spawn error cannot settle it again.
    const settle = (action, value) => {
      if (settled) {
        return;
      }
      settled = true;
      if (timeout !== void 0) {
        clearTimeout(timeout);
      }
      action(value);
    };

    if (agentTimeoutMs) {
      timeout = setTimeout(() => {
        child.kill();
        settle(reject, new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
      }, agentTimeoutMs);
    }

    // Decode at the stream level so multibyte characters split across chunks
    // are reassembled instead of being corrupted by per-chunk toString().
    child.stdout?.setEncoding("utf8");
    child.stderr?.setEncoding("utf8");
    child.stdout?.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr?.on("data", (chunk) => {
      stderr += chunk;
    });
    // A script that exits without reading stdin makes the write fail (EPIPE).
    // Without a listener that would surface as an unhandled stream error; the
    // outcome is decided by the `close` handler below instead.
    child.stdin?.on("error", () => {});

    child.on("error", (error) => {
      settle(reject, error);
    });
    // `close` fires only after the stdio streams have ended, whereas `exit`
    // can fire while output is still buffered and would truncate stdout.
    child.on("close", (code, signal) => {
      const output = stdout.trim();
      if (code !== 0) {
        if (code !== null && stderr.length > 0) {
          settle(reject, new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
          return;
        }
        // A failing run with nothing to parse must not look like an empty success.
        if (output.length === 0) {
          const reason = code === null ? `was terminated by signal ${signal}` : `exited with code ${code}`;
          const detail = stderr.trim().length > 0 ? `: ${stderr.trim()}` : " without producing output";
          settle(reject, new Error(`Code evaluator ${reason}${detail}`));
          return;
        }
      }
      settle(resolve, output);
    });

    child.stdin?.end(input);
  });
}
2657
/**
 * Parses JSON text without throwing.
 *
 * @param {string} payload - Text expected to contain JSON.
 * @returns {unknown} The parsed value, or `undefined` when `payload` is not
 *   valid JSON.
 */
function parseJsonSafe(payload) {
  let parsed = void 0;
  try {
    parsed = JSON.parse(payload);
  } catch {
    // Invalid JSON is reported to the caller as `undefined`.
  }
  return parsed;
}
1602
2664
 
1603
2665
  // src/evaluation/orchestrator.ts
1604
2666
  import { createHash, randomUUID as randomUUID2 } from "node:crypto";
1605
- import { mkdir, writeFile as writeFile2 } from "node:fs/promises";
1606
- import path4 from "node:path";
2667
+ import { mkdir as mkdir2, readFile as readFile4, writeFile as writeFile2 } from "node:fs/promises";
2668
+ import path7 from "node:path";
1607
2669
 
1608
2670
  // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
1609
2671
  var Node = class {
@@ -1750,7 +2812,7 @@ async function runEvaluation(options) {
1750
2812
  targets,
1751
2813
  env,
1752
2814
  providerFactory,
1753
- graders,
2815
+ evaluators,
1754
2816
  maxRetries,
1755
2817
  agentTimeoutMs,
1756
2818
  promptDumpDir,
@@ -1809,8 +2871,14 @@ async function runEvaluation(options) {
1809
2871
  }
1810
2872
  return getOrCreateProvider(resolvedJudge);
1811
2873
  };
1812
- const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
2874
+ const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
1813
2875
  const primaryProvider = getOrCreateProvider(target);
2876
+ const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
2877
+ if (target.providerBatching && !providerSupportsBatch && verbose) {
2878
+ console.warn(
2879
+ `Provider batching requested for target '${target.name}', but provider does not advertise batch support. Using per-case dispatch.`
2880
+ );
2881
+ }
1814
2882
  if (onProgress && filteredEvalCases.length > 0) {
1815
2883
  for (let i = 0; i < filteredEvalCases.length; i++) {
1816
2884
  await onProgress({
@@ -1820,6 +2888,28 @@ async function runEvaluation(options) {
1820
2888
  });
1821
2889
  }
1822
2890
  }
2891
+ if (providerSupportsBatch) {
2892
+ try {
2893
+ return await runBatchEvaluation({
2894
+ evalCases: filteredEvalCases,
2895
+ provider: primaryProvider,
2896
+ target,
2897
+ evaluatorRegistry,
2898
+ promptDumpDir,
2899
+ nowFn: now ?? (() => /* @__PURE__ */ new Date()),
2900
+ onProgress,
2901
+ onResult,
2902
+ verbose,
2903
+ resolveJudgeProvider,
2904
+ agentTimeoutMs
2905
+ });
2906
+ } catch (error) {
2907
+ if (verbose) {
2908
+ const message = error instanceof Error ? error.message : String(error);
2909
+ console.warn(`Provider batch execution failed, falling back to per-case dispatch: ${message}`);
2910
+ }
2911
+ }
2912
+ }
1823
2913
  const workers = options.maxConcurrency ?? target.workers ?? 1;
1824
2914
  const limit = pLimit(workers);
1825
2915
  let nextWorkerId = 1;
@@ -1842,7 +2932,7 @@ async function runEvaluation(options) {
1842
2932
  evalCase,
1843
2933
  provider: primaryProvider,
1844
2934
  target,
1845
- graders: graderRegistry,
2935
+ evaluators: evaluatorRegistry,
1846
2936
  maxRetries,
1847
2937
  agentTimeoutMs,
1848
2938
  promptDumpDir,
@@ -1903,12 +2993,118 @@ async function runEvaluation(options) {
1903
2993
  }
1904
2994
  return results;
1905
2995
  }
2996
/**
 * Evaluates all cases with a single `provider.invokeBatch` call, then scores
 * each response one by one.
 *
 * Prompt inputs are built (and optionally dumped) per case, sent to the
 * provider as one batch, and every response is passed to `evaluateCandidate`.
 * A failure while scoring one case is converted into an error result for that
 * case and does not stop the remaining cases.
 *
 * @param {object} options
 * @param {Array<object>} options.evalCases - Cases to run, in order.
 * @param {object} options.provider - Provider exposing `invokeBatch`.
 * @param {object} options.target - Target definition; `target.name` labels error results.
 * @param {object} options.evaluatorRegistry - Evaluators passed through to `evaluateCandidate`.
 * @param {string} [options.promptDumpDir] - When set, each prompt is dumped there.
 * @param {() => Date} options.nowFn - Clock used for result timestamps.
 * @param {Function} [options.onProgress] - Receives `running`, `completed` and `failed` events.
 * @param {Function} [options.onResult] - Receives each result as it is produced.
 * @param {Function} options.resolveJudgeProvider - Resolves the judge provider for `target`.
 * @param {number} [options.agentTimeoutMs] - Forwarded to `evaluateCandidate`.
 * @returns {Promise<Array<object>>} One result per eval case, in input order.
 * @throws {Error} When `invokeBatch` does not return an array, or returns a
 *   different number of responses than there are eval cases.
 */
async function runBatchEvaluation(options) {
  const {
    evalCases,
    provider,
    target,
    evaluatorRegistry,
    promptDumpDir,
    nowFn,
    onProgress,
    onResult,
    resolveJudgeProvider,
    agentTimeoutMs
  } = options;

  const promptInputsList = [];
  for (const evalCase of evalCases) {
    const promptInputs = await buildPromptInputs(evalCase);
    if (promptDumpDir) {
      await dumpPrompt(promptDumpDir, evalCase, promptInputs);
    }
    promptInputsList.push(promptInputs);
  }

  const batchRequests = evalCases.map((evalCase, index) => {
    const promptInputs = promptInputsList[index];
    return {
      prompt: promptInputs.request,
      guidelines: promptInputs.guidelines,
      guideline_patterns: evalCase.guideline_patterns,
      inputFiles: evalCase.file_paths,
      evalCaseId: evalCase.id,
      metadata: {
        systemPrompt: promptInputs.systemMessage ?? ""
      }
    };
  });

  // A provider without invokeBatch yields `undefined` here and is rejected by
  // the array check below.
  const batchResponse = await provider.invokeBatch?.(batchRequests);
  if (!Array.isArray(batchResponse)) {
    throw new Error("Provider batching failed: invokeBatch did not return an array");
  }
  if (batchResponse.length !== evalCases.length) {
    throw new Error(
      `Provider batching failed: expected ${evalCases.length} responses, received ${batchResponse.length}`
    );
  }

  // One start timestamp for the whole batch, reused by every progress event so
  // listeners never see a `startedAt` of 0 (the Unix epoch) on completion.
  const startedAt = Date.now();
  if (onProgress) {
    for (const evalCase of evalCases) {
      await onProgress({
        workerId: 1,
        evalId: evalCase.id,
        status: "running",
        startedAt
      });
    }
  }

  const results = [];
  for (let i = 0; i < evalCases.length; i++) {
    const evalCase = evalCases[i];
    const promptInputs = promptInputsList[i];
    const providerResponse = batchResponse[i];
    let result;
    try {
      result = await evaluateCandidate({
        evalCase,
        candidate: providerResponse.text ?? "",
        target,
        provider,
        evaluators: evaluatorRegistry,
        promptInputs,
        nowFn,
        attempt: 0,
        judgeProvider: await resolveJudgeProvider(target),
        agentTimeoutMs
      });
    } catch (error) {
      const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
      results.push(errorResult);
      if (onResult) {
        await onResult(errorResult);
      }
      if (onProgress) {
        await onProgress({
          workerId: 1,
          evalId: evalCase.id,
          status: "failed",
          startedAt,
          completedAt: Date.now(),
          error: error instanceof Error ? error.message : String(error)
        });
      }
      continue;
    }
    results.push(result);
    if (onResult) {
      await onResult(result);
    }
    if (onProgress) {
      await onProgress({
        workerId: 1,
        evalId: evalCase.id,
        status: "completed",
        startedAt,
        completedAt: Date.now()
      });
    }
  }
  return results;
}
1906
3102
  async function runEvalCase(options) {
1907
3103
  const {
1908
3104
  evalCase,
1909
3105
  provider,
1910
3106
  target,
1911
- graders,
3107
+ evaluators,
1912
3108
  now,
1913
3109
  maxRetries,
1914
3110
  agentTimeoutMs,
@@ -1963,27 +3159,49 @@ async function runEvalCase(options) {
1963
3159
  if (cacheKey && cache && !cachedResponse) {
1964
3160
  await cache.set(cacheKey, providerResponse);
1965
3161
  }
1966
- const graderKind = evalCase.grader ?? "heuristic";
1967
- const activeGrader = graders[graderKind] ?? graders.heuristic;
1968
- if (!activeGrader) {
1969
- throw new Error(`No grader registered for kind '${graderKind}'`);
1970
- }
1971
- let grade;
1972
3162
  try {
1973
- const gradeTimestamp = nowFn();
1974
- grade = await activeGrader.grade({
3163
+ return await evaluateCandidate({
1975
3164
  evalCase,
1976
3165
  candidate: providerResponse.text ?? "",
1977
3166
  target,
1978
3167
  provider,
1979
- attempt,
3168
+ evaluators,
1980
3169
  promptInputs,
1981
- now: gradeTimestamp,
1982
- judgeProvider
3170
+ nowFn,
3171
+ attempt,
3172
+ judgeProvider,
3173
+ agentTimeoutMs
1983
3174
  });
1984
3175
  } catch (error) {
1985
3176
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
1986
3177
  }
3178
+ }
3179
+ async function evaluateCandidate(options) {
3180
+ const {
3181
+ evalCase,
3182
+ candidate,
3183
+ target,
3184
+ provider,
3185
+ evaluators,
3186
+ promptInputs,
3187
+ nowFn,
3188
+ attempt,
3189
+ judgeProvider,
3190
+ agentTimeoutMs
3191
+ } = options;
3192
+ const gradeTimestamp = nowFn();
3193
+ const { score, evaluatorResults } = await runEvaluatorsForCase({
3194
+ evalCase,
3195
+ candidate,
3196
+ target,
3197
+ provider,
3198
+ evaluators,
3199
+ attempt,
3200
+ promptInputs,
3201
+ now: gradeTimestamp,
3202
+ judgeProvider,
3203
+ agentTimeoutMs
3204
+ });
1987
3205
  const completedAt = nowFn();
1988
3206
  const rawRequest = {
1989
3207
  request: promptInputs.request,
@@ -1994,28 +3212,200 @@ async function runEvalCase(options) {
1994
3212
  return {
1995
3213
  eval_id: evalCase.id,
1996
3214
  conversation_id: evalCase.conversation_id,
1997
- score: grade.score,
1998
- hits: grade.hits,
1999
- misses: grade.misses,
2000
- model_answer: providerResponse.text ?? "",
2001
- expected_aspect_count: grade.expectedAspectCount,
3215
+ score: score.score,
3216
+ hits: score.hits,
3217
+ misses: score.misses,
3218
+ model_answer: candidate,
3219
+ expected_aspect_count: score.expectedAspectCount,
2002
3220
  target: target.name,
2003
3221
  timestamp: completedAt.toISOString(),
2004
- reasoning: grade.reasoning,
2005
- raw_aspects: grade.rawAspects,
3222
+ reasoning: score.reasoning,
3223
+ raw_aspects: score.rawAspects,
2006
3224
  raw_request: rawRequest,
2007
- grader_raw_request: grade.graderRawRequest
3225
+ evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
3226
+ evaluator_results: evaluatorResults
2008
3227
  };
2009
3228
  }
3229
/**
 * Scores one candidate answer for an eval case.
 *
 * When the case declares its own `evaluators` list, the work is handed to
 * `runEvaluatorList`. Otherwise a single evaluator is looked up in the
 * registry by `evalCase.evaluator` (default `"llm_judge"`), falling back to
 * the registry's `llm_judge` entry when that kind is not registered.
 *
 * @param {object} options - Evaluation inputs; `options.evaluators` is the
 *   registry keyed by evaluator kind.
 * @returns {Promise<object>} `{ score }` for the single-evaluator path, or
 *   whatever `runEvaluatorList` returns for the list path.
 * @throws {Error} When neither the requested kind nor `llm_judge` is registered.
 */
async function runEvaluatorsForCase(options) {
  const {
    evalCase,
    candidate,
    target,
    provider,
    evaluators,
    attempt,
    promptInputs,
    now,
    judgeProvider,
    agentTimeoutMs
  } = options;

  const hasCaseEvaluators = evalCase.evaluators && evalCase.evaluators.length > 0;
  if (hasCaseEvaluators) {
    return runEvaluatorList({
      evalCase,
      evaluators: evalCase.evaluators,
      candidate,
      target,
      provider,
      evaluatorRegistry: evaluators,
      attempt,
      promptInputs,
      now,
      judgeProvider,
      agentTimeoutMs
    });
  }

  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
  const activeEvaluator = evaluators[evaluatorKind] ?? evaluators.llm_judge;
  if (activeEvaluator) {
    const evaluationContext = {
      evalCase,
      candidate,
      target,
      provider,
      attempt,
      promptInputs,
      now,
      judgeProvider
    };
    const score = await activeEvaluator.evaluate(evaluationContext);
    return { score };
  }
  throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
}
3263
/**
 * Runs every evaluator configured on an eval case and merges their scores.
 *
 * Evaluators run sequentially in list order. `llm_judge` entries go through
 * `runLlmJudgeEvaluator`; `code` entries get a fresh `CodeEvaluator`. An
 * evaluator that throws is recorded as a zero score whose miss text carries
 * the error, so one failure never aborts the others. Entries with any other
 * `type` are skipped and do not appear in the results.
 *
 * @param {object} options
 * @param {Array<object>} [options.evaluators] - Evaluator configs from the eval case.
 * @param {object} options.evaluatorRegistry - Registry passed to `runLlmJudgeEvaluator`.
 * @param {number} [options.agentTimeoutMs] - Timeout given to code evaluators.
 * @returns {Promise<{score: object, evaluatorResults: Array<object>}>} The
 *   aggregate score (mean of the individual scores; hits, misses and raw
 *   aspects concatenated; expected-aspect counts summed; reasonings joined
 *   with `" | "`) plus one report entry per evaluator that ran.
 */
async function runEvaluatorList(options) {
  const {
    evalCase,
    evaluators,
    candidate,
    target,
    provider,
    evaluatorRegistry,
    attempt,
    promptInputs,
    now,
    judgeProvider,
    agentTimeoutMs
  } = options;
  const scored = [];
  const evaluatorResults = [];

  // Records a successful run. The report entry is built before anything is
  // pushed, so a malformed (e.g. undefined) outcome throws here and is
  // handled as a failure instead of leaving a half-recorded entry behind.
  const recordSuccess = (evaluator, outcome) => {
    const report = {
      name: evaluator.name,
      type: evaluator.type,
      score: outcome.score,
      hits: outcome.hits,
      misses: outcome.misses,
      reasoning: outcome.reasoning,
      evaluator_raw_request: outcome.evaluatorRawRequest
    };
    scored.push({ score: outcome, name: evaluator.name, type: evaluator.type });
    evaluatorResults.push(report);
  };

  // Records a failed run as a zero score, using the same display name in the
  // aggregate and in the per-evaluator report.
  const recordFailure = (evaluator, error) => {
    const message = error instanceof Error ? error.message : String(error);
    const name = evaluator.name ?? "unknown";
    const type = evaluator.type ?? "unknown";
    const missText = `Evaluator '${name}' failed: ${message}`;
    scored.push({
      score: {
        score: 0,
        hits: [],
        misses: [missText],
        expectedAspectCount: 1,
        reasoning: message
      },
      name,
      type
    });
    evaluatorResults.push({
      name,
      type,
      score: 0,
      hits: [],
      misses: [missText],
      reasoning: message
    });
  };

  for (const evaluator of evaluators ?? []) {
    try {
      if (evaluator.type === "llm_judge") {
        const outcome = await runLlmJudgeEvaluator({
          config: evaluator,
          evalCase,
          candidate,
          target,
          provider,
          evaluatorRegistry,
          attempt,
          promptInputs,
          now,
          judgeProvider
        });
        recordSuccess(evaluator, outcome);
      } else if (evaluator.type === "code") {
        const codeEvaluator = new CodeEvaluator({
          script: evaluator.script,
          cwd: evaluator.resolvedCwd ?? evaluator.cwd,
          agentTimeoutMs
        });
        const outcome = await codeEvaluator.evaluate({
          evalCase,
          candidate,
          target,
          provider,
          attempt,
          promptInputs,
          now
        });
        recordSuccess(evaluator, outcome);
      }
    } catch (error) {
      recordFailure(evaluator, error);
    }
  }

  const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
  const hits = scored.flatMap((entry) => entry.score.hits);
  const misses = scored.flatMap((entry) => entry.score.misses);
  const expectedAspectCount = scored.reduce((total, entry) => total + (entry.score.expectedAspectCount ?? 0), 0);
  const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
  // Only evaluators that supplied a reasoning contribute a "name: reasoning" part.
  const reasoningParts = scored.flatMap((entry) => entry.score.reasoning ? [`${entry.name}: ${entry.score.reasoning}`] : []);
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
  const score = {
    score: aggregateScore,
    hits,
    misses,
    expectedAspectCount,
    reasoning,
    rawAspects: rawAspects.length > 0 ? rawAspects : void 0
  };
  return { score, evaluatorResults };
}
3370
/**
 * Runs the registry's `llm_judge` evaluator for one configured evaluator entry.
 *
 * The entry's custom prompt (resolved via `resolveCustomPrompt`) is passed as
 * `systemPrompt`, the entry itself as `evaluator`, and its `model` as
 * `judgeModel`.
 *
 * @param {object} options - Evaluation inputs plus `config` (the evaluator
 *   entry) and `evaluatorRegistry` (must contain `llm_judge`).
 * @returns {Promise<object>} Whatever the `llm_judge` evaluator's `evaluate` returns.
 */
async function runLlmJudgeEvaluator(options) {
  const {
    config,
    evalCase,
    candidate,
    target,
    provider,
    evaluatorRegistry,
    attempt,
    promptInputs,
    now,
    judgeProvider
  } = options;
  const systemPrompt = await resolveCustomPrompt(config);
  const judgeContext = {
    evalCase,
    candidate,
    target,
    provider,
    attempt,
    promptInputs,
    now,
    judgeProvider,
    systemPrompt,
    evaluator: config,
    judgeModel: config.model
  };
  return evaluatorRegistry.llm_judge.evaluate(judgeContext);
}
3387
/**
 * Resolves the custom judge prompt for an evaluator config.
 *
 * A readable `config.promptPath` wins: its UTF-8 contents are returned. When
 * no path is set, or the file cannot be read (a warning is logged in that
 * case), the inline `config.prompt` is returned instead.
 *
 * @param {object} config - Evaluator config with optional `promptPath` and `prompt`.
 * @returns {Promise<string|undefined>} The prompt text, or `undefined` when
 *   neither source provides one.
 */
async function resolveCustomPrompt(config) {
  if (!config.promptPath) {
    return config.prompt;
  }
  try {
    return await readFile4(config.promptPath, "utf8");
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
    return config.prompt;
  }
}
3398
/**
 * Reports whether a value is a string containing at least one
 * non-whitespace character.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} `false` for non-strings and for blank strings.
 */
function isNonEmptyString2(value) {
  const isText = typeof value === "string";
  return isText ? value.trim().length > 0 : false;
}
2010
3401
  function filterEvalCases(evalCases, evalId) {
2011
3402
  if (!evalId) {
2012
3403
  return evalCases;
2013
3404
  }
2014
3405
  return evalCases.filter((evalCase) => evalCase.id === evalId);
2015
3406
  }
2016
- function buildGraderRegistry(overrides, resolveJudgeProvider) {
2017
- const heuristic = overrides?.heuristic ?? new HeuristicGrader();
2018
- const llmJudge = overrides?.llm_judge ?? new QualityGrader({
3407
+ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
3408
+ const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
2019
3409
  resolveJudgeProvider: async (context) => {
2020
3410
  if (context.judgeProvider) {
2021
3411
  return context.judgeProvider;
@@ -2025,15 +3415,14 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
2025
3415
  });
2026
3416
  return {
2027
3417
  ...overrides,
2028
- heuristic,
2029
3418
  llm_judge: llmJudge
2030
3419
  };
2031
3420
  }
2032
3421
  async function dumpPrompt(directory, evalCase, promptInputs) {
2033
3422
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
2034
3423
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
2035
- const filePath = path4.resolve(directory, filename);
2036
- await mkdir(path4.dirname(filePath), { recursive: true });
3424
+ const filePath = path7.resolve(directory, filename);
3425
+ await mkdir2(path7.dirname(filePath), { recursive: true });
2037
3426
  const payload = {
2038
3427
  eval_id: evalCase.id,
2039
3428
  request: promptInputs.request,
@@ -2050,7 +3439,7 @@ function sanitizeFilename(value) {
2050
3439
  return sanitized.length > 0 ? sanitized : randomUUID2();
2051
3440
  }
2052
3441
  async function invokeProvider(provider, options) {
2053
- const { evalCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
3442
+ const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
2054
3443
  const controller = new AbortController();
2055
3444
  const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
2056
3445
  if (signal) {
@@ -2061,7 +3450,7 @@ async function invokeProvider(provider, options) {
2061
3450
  prompt: promptInputs.request,
2062
3451
  guidelines: promptInputs.guidelines,
2063
3452
  guideline_patterns: evalCase.guideline_patterns,
2064
- attachments: evalCase.file_paths,
3453
+ inputFiles: evalCase.file_paths,
2065
3454
  evalCaseId: evalCase.id,
2066
3455
  attempt,
2067
3456
  metadata: {
@@ -2129,25 +3518,20 @@ function createAgentKernel() {
2129
3518
  return { status: "stub" };
2130
3519
  }
2131
3520
  export {
2132
- GRADER_KINDS,
2133
- HeuristicGrader,
2134
- QualityGrader,
3521
+ CodeEvaluator,
3522
+ LlmJudgeEvaluator,
2135
3523
  TEST_MESSAGE_ROLES,
2136
3524
  buildDirectoryChain,
2137
3525
  buildPromptInputs,
2138
3526
  buildSearchRoots,
2139
- calculateHits,
2140
- calculateMisses,
2141
3527
  createAgentKernel,
2142
3528
  createProvider,
2143
3529
  ensureVSCodeSubagents,
2144
- extractAspects,
2145
3530
  extractCodeBlocks,
2146
3531
  fileExists,
2147
3532
  findGitRoot,
2148
3533
  getHitCount,
2149
- isErrorLike,
2150
- isGraderKind,
3534
+ isEvaluatorKind,
2151
3535
  isGuidelineFile,
2152
3536
  isJsonObject,
2153
3537
  isJsonValue,
@@ -2160,7 +3544,6 @@ export {
2160
3544
  resolveFileReference,
2161
3545
  resolveTargetDefinition,
2162
3546
  runEvalCase,
2163
- runEvaluation,
2164
- scoreCandidateResponse
3547
+ runEvaluation
2165
3548
  };
2166
3549
  //# sourceMappingURL=index.js.map