@agentv/core 0.5.1 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -4,8 +4,9 @@ import {
4
4
  buildSearchRoots,
5
5
  fileExists,
6
6
  findGitRoot,
7
+ readTextFile,
7
8
  resolveFileReference
8
- } from "./chunk-NL7K4CAK.js";
9
+ } from "./chunk-OW3SHBIJ.js";
9
10
 
10
11
  // src/evaluation/types.ts
11
12
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -149,6 +150,9 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
149
150
  throw new Error(`Invalid test file format: ${evalFilePath}`);
150
151
  }
151
152
  const suite = parsed;
153
+ const datasetNameFromSuite = asString(suite.dataset)?.trim();
154
+ const fallbackDataset = path.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
155
+ const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
152
156
  const schema = suite.$schema;
153
157
  if (schema !== SCHEMA_EVAL_V2) {
154
158
  const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
@@ -296,6 +300,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
296
300
  ];
297
301
  const testCase = {
298
302
  id,
303
+ dataset: datasetName,
299
304
  conversation_id: conversationId,
300
305
  task: userTextPrompt,
301
306
  user_segments: userSegments,
@@ -676,6 +681,9 @@ var AzureProvider = class {
676
681
  );
677
682
  return mapResponse(ensureChatResponse(response));
678
683
  }
684
+ getAxAI() {
685
+ return this.ai;
686
+ }
679
687
  };
680
688
  var AnthropicProvider = class {
681
689
  constructor(targetName, config) {
@@ -710,6 +718,9 @@ var AnthropicProvider = class {
710
718
  );
711
719
  return mapResponse(ensureChatResponse(response));
712
720
  }
721
+ getAxAI() {
722
+ return this.ai;
723
+ }
713
724
  };
714
725
  var GeminiProvider = class {
715
726
  constructor(targetName, config) {
@@ -743,6 +754,9 @@ var GeminiProvider = class {
743
754
  );
744
755
  return mapResponse(ensureChatResponse(response));
745
756
  }
757
+ getAxAI() {
758
+ return this.ai;
759
+ }
746
760
  };
747
761
 
748
762
  // src/evaluation/providers/cli.ts
@@ -955,7 +969,8 @@ function formatTimeoutSuffix(timeoutMs) {
955
969
 
956
970
  // src/evaluation/providers/codex.ts
957
971
  import { exec as execCallback, spawn } from "node:child_process";
958
- import { constants as constants2 } from "node:fs";
972
+ import { randomUUID } from "node:crypto";
973
+ import { constants as constants2, createWriteStream } from "node:fs";
959
974
  import { access as access2, copyFile, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
960
975
  import { tmpdir } from "node:os";
961
976
  import path4 from "node:path";
@@ -1062,6 +1077,59 @@ function pathToFileUri(filePath) {
1062
1077
  return `file://${normalizedPath}`;
1063
1078
  }
1064
1079
 
1080
+ // src/evaluation/providers/codex-log-tracker.ts
1081
+ var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
1082
+ var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
1083
/**
 * Returns the array of recorded Codex log entries kept on `globalThis` under
 * `GLOBAL_LOGS_KEY`, creating and registering an empty array on first use.
 * @returns {Array} The shared (mutable) entry store.
 */
function getCodexLogStore() {
  let entries = globalThis[GLOBAL_LOGS_KEY];
  if (!entries) {
    entries = [];
    globalThis[GLOBAL_LOGS_KEY] = entries;
  }
  return entries;
}
1093
/**
 * Returns the `Set` of log-entry listeners kept on `globalThis` under
 * `GLOBAL_SUBSCRIBERS_KEY`, creating and registering an empty set on first use.
 * @returns {Set<Function>} The shared (mutable) listener set.
 */
function getSubscriberStore() {
  let listeners = globalThis[GLOBAL_SUBSCRIBERS_KEY];
  if (!listeners) {
    listeners = /* @__PURE__ */ new Set();
    globalThis[GLOBAL_SUBSCRIBERS_KEY] = listeners;
  }
  return listeners;
}
1103
/**
 * Calls every registered listener with `entry`.
 *
 * The listener set is copied first, so listeners that subscribe or unsubscribe
 * while being notified do not affect this round. A listener that throws is
 * reported with `console.warn` and does not stop the remaining listeners.
 * @param {*} entry Value handed to each listener.
 */
function notifySubscribers(entry) {
  const snapshot = [...getSubscriberStore()];
  snapshot.forEach((listener) => {
    try {
      listener(entry);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      console.warn(`Codex log subscriber failed: ${reason}`);
    }
  });
}
1114
/**
 * Appends `entry` to the shared log store, then notifies all subscribers.
 * @param {*} entry The log entry to store and broadcast.
 */
function recordCodexLogEntry(entry) {
  const store = getCodexLogStore();
  store.push(entry);
  notifySubscribers(entry);
}
1118
/**
 * Drains the shared log store.
 * @returns {Array} A new array holding every entry recorded so far (empty when
 *   nothing was recorded); the store itself is left empty.
 */
function consumeCodexLogEntries() {
  // splice(0) removes everything from index 0 and hands back the removed items.
  return getCodexLogStore().splice(0);
}
1125
/**
 * Registers `listener` in the shared subscriber set.
 * @param {Function} listener Callback to add.
 * @returns {Function} A function that removes `listener` from the set again.
 */
function subscribeToCodexLogEntries(listener) {
  const subscribers = getSubscriberStore();
  subscribers.add(listener);
  return () => {
    subscribers.delete(listener);
  };
}
1132
+
1065
1133
  // src/evaluation/providers/codex.ts
1066
1134
  var execAsync2 = promisify2(execCallback);
1067
1135
  var WORKSPACE_PREFIX = "agentv-codex-";
@@ -1093,6 +1161,7 @@ var CodexProvider = class {
1093
1161
  collectGuidelineFiles(inputFiles, request.guideline_patterns).map((file) => path4.resolve(file))
1094
1162
  );
1095
1163
  const workspaceRoot = await this.createWorkspace();
1164
+ const logger = await this.createStreamLogger(request).catch(() => void 0);
1096
1165
  try {
1097
1166
  const { mirroredInputFiles, guidelineMirrors } = await this.mirrorInputFiles(
1098
1167
  inputFiles,
@@ -1107,7 +1176,7 @@ var CodexProvider = class {
1107
1176
  await writeFile(promptFile, promptContent, "utf8");
1108
1177
  const args = this.buildCodexArgs();
1109
1178
  const cwd = this.resolveCwd(workspaceRoot);
1110
- const result = await this.executeCodex(args, cwd, promptContent, request.signal);
1179
+ const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
1111
1180
  if (result.timedOut) {
1112
1181
  throw new Error(
1113
1182
  `Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
@@ -1131,10 +1200,12 @@ var CodexProvider = class {
1131
1200
  executable: this.resolvedExecutable ?? this.config.executable,
1132
1201
  promptFile,
1133
1202
  workspace: workspaceRoot,
1134
- inputFiles: mirroredInputFiles
1203
+ inputFiles: mirroredInputFiles,
1204
+ logFile: logger?.filePath
1135
1205
  }
1136
1206
  };
1137
1207
  } finally {
1208
+ await logger?.close();
1138
1209
  await this.cleanupWorkspace(workspaceRoot);
1139
1210
  }
1140
1211
  }
@@ -1161,7 +1232,7 @@ var CodexProvider = class {
1161
1232
  args.push("-");
1162
1233
  return args;
1163
1234
  }
1164
- async executeCodex(args, cwd, promptContent, signal) {
1235
+ async executeCodex(args, cwd, promptContent, signal, logger) {
1165
1236
  try {
1166
1237
  return await this.runCodex({
1167
1238
  executable: this.resolvedExecutable ?? this.config.executable,
@@ -1170,7 +1241,9 @@ var CodexProvider = class {
1170
1241
  prompt: promptContent,
1171
1242
  timeoutMs: this.config.timeoutMs,
1172
1243
  env: process.env,
1173
- signal
1244
+ signal,
1245
+ onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
1246
+ onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
1174
1247
  });
1175
1248
  } catch (error) {
1176
1249
  const err = error;
@@ -1222,7 +1295,240 @@ var CodexProvider = class {
1222
1295
  } catch {
1223
1296
  }
1224
1297
  }
1298
+ resolveLogDirectory() {
1299
+ const disabled = isCodexLogStreamingDisabled();
1300
+ if (disabled) {
1301
+ return void 0;
1302
+ }
1303
+ if (this.config.logDir) {
1304
+ return path4.resolve(this.config.logDir);
1305
+ }
1306
+ return path4.join(process.cwd(), ".agentv", "logs", "codex");
1307
+ }
1308
+ async createStreamLogger(request) {
1309
+ const logDir = this.resolveLogDirectory();
1310
+ if (!logDir) {
1311
+ return void 0;
1312
+ }
1313
+ try {
1314
+ await mkdir(logDir, { recursive: true });
1315
+ } catch (error) {
1316
+ const message = error instanceof Error ? error.message : String(error);
1317
+ console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
1318
+ return void 0;
1319
+ }
1320
+ const filePath = path4.join(logDir, buildLogFilename(request, this.targetName));
1321
+ try {
1322
+ const logger = await CodexStreamLogger.create({
1323
+ filePath,
1324
+ targetName: this.targetName,
1325
+ evalCaseId: request.evalCaseId,
1326
+ attempt: request.attempt,
1327
+ format: this.config.logFormat ?? "summary"
1328
+ });
1329
+ recordCodexLogEntry({
1330
+ filePath,
1331
+ targetName: this.targetName,
1332
+ evalCaseId: request.evalCaseId,
1333
+ attempt: request.attempt
1334
+ });
1335
+ return logger;
1336
+ } catch (error) {
1337
+ const message = error instanceof Error ? error.message : String(error);
1338
+ console.warn(`Skipping Codex stream logging for ${filePath}: ${message}`);
1339
+ return void 0;
1340
+ }
1341
+ }
1225
1342
  };
1343
/**
 * Best-effort writer that turns Codex CLI stdout/stderr chunks into a
 * line-oriented log file (opened in append mode).
 *
 * Chunks are buffered per source until a newline arrives; every non-blank line
 * is written as `[+elapsed] [source] message`. I/O failures never throw out of
 * the chunk handlers or `close()`: the first stream error is remembered,
 * further writes are dropped, and `close()` prints a single warning.
 */
var CodexStreamLogger = class _CodexStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  // First error emitted by the stream, if any; once set, writes are skipped.
  streamError = void 0;
  // Set by close() so repeated calls (and late chunks) are harmless.
  closed = false;
  /**
   * @param {string} filePath Log file to append to.
   * @param {string} format "json" selects formatCodexJsonLog; anything else
   *   selects formatCodexLogMessage.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = createWriteStream(filePath, { flags: "a" });
    // A stream "error" event with no listener is rethrown as an uncaught
    // exception; keep a permanent listener so a logging failure cannot take
    // the whole evaluation run down.
    this.stream.on("error", (error) => {
      if (this.streamError === void 0) {
        this.streamError = error;
      }
    });
  }
  /**
   * Opens the log file and writes the header.
   * @param {object} options `filePath`, `targetName`, `format`, and optional
   *   `evalCaseId` / zero-based `attempt`.
   * @returns {Promise<_CodexStreamLogger>} Resolves once the file is open.
   * @throws Rejects with the stream error when the file cannot be opened, so
   *   the caller can skip logging.
   */
  static async create(options) {
    const logger = new _CodexStreamLogger(options.filePath, options.format);
    await logger.waitUntilOpen();
    const header = [
      "# Codex CLI stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
      // Drop only the omitted optional lines; the trailing "" is the blank
      // separator between header and body and must survive.
    ].filter((line) => line !== void 0);
    logger.writeLines(header);
    return logger;
  }
  /** Settles when the underlying file is opened (resolve) or fails to open (reject). */
  waitUntilOpen() {
    return new Promise((resolve, reject) => {
      const onOpen = () => {
        this.stream.off("error", onError);
        resolve();
      };
      const onError = (error) => {
        this.stream.off("open", onOpen);
        reject(error);
      };
      this.stream.once("open", onOpen);
      this.stream.once("error", onError);
    });
  }
  /** Buffers a stdout chunk and writes any lines it completes. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** Buffers a stderr chunk and writes any lines it completes. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Writes any unterminated trailing text and ends the stream.
   * Never rejects: callers invoke this from `finally` blocks ahead of other
   * cleanup, so a logging failure is reported with `console.warn` instead.
   */
  async close() {
    if (this.closed) {
      return;
    }
    this.flushRemainder();
    this.closed = true;
    const { stream } = this;
    if (!stream.destroyed) {
      await new Promise((resolve) => {
        // Whichever fires first settles the promise; later calls are no-ops.
        stream.once("error", resolve);
        stream.once("close", resolve);
        stream.end(resolve);
      });
    }
    if (this.streamError !== void 0) {
      const error = this.streamError;
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Codex stream log ${this.filePath} may be incomplete: ${message}`);
    }
  }
  /** Writes each entry of `lines` followed by a newline. */
  writeLines(lines) {
    for (const line of lines) {
      this.write(`${line}
`);
    }
  }
  /** Writes `text` unless the stream has failed, been destroyed, or been closed. */
  write(text) {
    if (this.closed || this.streamError !== void 0 || this.stream.destroyed) {
      return;
    }
    this.stream.write(text);
  }
  /** Writes every complete line buffered for `source`; keeps the unterminated tail. */
  flushBuffer(source) {
    const key = source === "stdout" ? "stdoutBuffer" : "stderrBuffer";
    const lines = this[key].split(/\r?\n/);
    this[key] = lines.pop() ?? "";
    for (const line of lines) {
      this.writeFormatted(line, source);
    }
  }
  /** Formats one raw line and writes it; blank lines are skipped. */
  writeFormatted(rawLine, source) {
    const formatted = this.formatLine(rawLine, source);
    if (formatted) {
      this.write(`${formatted}\n`);
    }
  }
  /**
   * @returns {string|undefined} `[+elapsed] [source] message`, or `undefined`
   *   when the line is blank after trimming.
   */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatCodexJsonLog(trimmed) : formatCodexLogMessage(trimmed, source);
    return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
  }
  /** Writes whatever unterminated text is left in both buffers, then clears them. */
  flushRemainder() {
    this.writeFormatted(this.stdoutBuffer, "stdout");
    this.writeFormatted(this.stderrBuffer, "stderr");
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
};
1437
/**
 * Reads the `AGENTV_CODEX_STREAM_LOGS` environment variable.
 * @returns {boolean} `true` only when the trimmed, lower-cased value is
 *   "false", "0" or "off"; `false` when unset, empty, or anything else.
 */
function isCodexLogStreamingDisabled() {
  const raw = process.env.AGENTV_CODEX_STREAM_LOGS;
  if (!raw) {
    return false;
  }
  return ["false", "0", "off"].includes(raw.trim().toLowerCase());
}
1445
/**
 * Builds a log file name of the form
 * `<iso-timestamp>_<target>_<evalId>[_attempt-N]_<8 uuid chars>.log`.
 * @param {object} request Reads `evalCaseId` (falls back to "codex" when
 *   nullish) and the zero-based `attempt` (printed one-based when present).
 * @param {string} targetName Target name; sanitized before use.
 * @returns {string} The file name (no directory part).
 */
function buildLogFilename(request, targetName) {
  // ':' and '.' in the ISO timestamp are replaced so the name is filesystem-friendly.
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const caseLabel = sanitizeForFilename(request.evalCaseId ?? "codex");
  const attemptLabel = request.attempt === void 0 ? "" : `_attempt-${request.attempt + 1}`;
  const targetLabel = sanitizeForFilename(targetName);
  const uniqueSuffix = randomUUID().slice(0, 8);
  return `${[stamp, targetLabel, `${caseLabel}${attemptLabel}`, uniqueSuffix].join("_")}.log`;
}
1452
/**
 * Replaces every run of characters outside `A-Z a-z 0-9 . _ -` with a single
 * underscore.
 * @param {string} value Text to sanitize.
 * @returns {string} The sanitized text, or "codex" when the result is empty.
 */
function sanitizeForFilename(value) {
  return value.replace(/[^A-Za-z0-9._-]+/g, "_") || "codex";
}
1456
/**
 * Formats the whole seconds elapsed since `startedAt` as `MM:SS`, or
 * `HH:MM:SS` once at least one hour has passed.
 * @param {number} startedAt Epoch milliseconds (as from `Date.now()`).
 * @returns {string} Zero-padded elapsed time.
 */
function formatElapsed(startedAt) {
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const twoDigits = (part) => part.toString().padStart(2, "0");
  const hours = Math.floor(totalSeconds / 3600);
  const parts = [Math.floor(totalSeconds % 3600 / 60), totalSeconds % 60];
  if (hours > 0) {
    parts.unshift(hours);
  }
  return parts.map((part) => twoDigits(part)).join(":");
}
1466
/**
 * Produces the "summary" text for one raw output line.
 *
 * When the line parses as JSON and `summarizeCodexEvent` yields a summary, that
 * summary is used. Otherwise the raw line is returned, prefixed with
 * `stderr: ` when it came from stderr.
 * @param {string} rawLine One (already trimmed) output line.
 * @param {string} source "stdout" or "stderr".
 * @returns {string} The text to log.
 */
function formatCodexLogMessage(rawLine, source) {
  const event = tryParseJsonValue(rawLine);
  const summary = event ? summarizeCodexEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
1479
/**
 * Pretty-prints a JSON output line with two-space indentation.
 * @param {string} rawLine One (already trimmed) output line.
 * @returns {string} The re-serialized JSON, or `rawLine` unchanged when it
 *   does not parse, parses to a falsy value, or cannot be serialized.
 */
function formatCodexJsonLog(rawLine) {
  const value = tryParseJsonValue(rawLine);
  if (value) {
    try {
      return JSON.stringify(value, null, 2);
    } catch {
      // Fall through to the raw line.
    }
  }
  return rawLine;
}
1490
/**
 * Reduces a parsed Codex event to a short one-line description.
 *
 * Message text is taken from the first source that yields a value:
 * `extractFromEvent(event)`, `extractFromItem(event.item)`, then
 * `flattenContent(event.output ?? event.content)`; for events whose type is
 * `JSONL_TYPE_ITEM_COMPLETED` the item's `text`/`content`/`output` is tried as
 * a last resort.
 * @param {*} event Parsed JSON value.
 * @returns {string|undefined} `"type: message"`, the message alone, or —
 *   without a message — `"type:itemType"` / `"type"`; `undefined` when `event`
 *   is not an object or nothing usable is found.
 */
function summarizeCodexEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const eventType = typeof event.type === "string" ? event.type : void 0;
  const { item } = event;
  let text = extractFromEvent(event) ?? extractFromItem(item) ?? flattenContent(event.output ?? event.content);
  if (!text && eventType === JSONL_TYPE_ITEM_COMPLETED && item && typeof item === "object") {
    const candidate = flattenContent(item.text ?? item.content ?? item.output);
    if (candidate) {
      text = candidate;
    }
  }
  if (text) {
    return eventType ? `${eventType}: ${text}` : text;
  }
  // No message text: describe the event by its type(s) only.
  const itemType = typeof item?.type === "string" ? item.type : void 0;
  if (eventType && itemType) {
    return `${eventType}:${itemType}`;
  }
  return eventType;
}
1525
/**
 * Parses `rawLine` as JSON without throwing.
 * @param {string} rawLine Text to parse.
 * @returns {*} The parsed value, or `undefined` when parsing fails.
 */
function tryParseJsonValue(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
1226
1532
  async function locateExecutable(candidate) {
1227
1533
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
1228
1534
  if (includesPathSeparator) {
@@ -1492,10 +1798,12 @@ async function defaultCodexRunner(options) {
1492
1798
  child.stdout.setEncoding("utf8");
1493
1799
  child.stdout.on("data", (chunk) => {
1494
1800
  stdout += chunk;
1801
+ options.onStdoutChunk?.(chunk);
1495
1802
  });
1496
1803
  child.stderr.setEncoding("utf8");
1497
1804
  child.stderr.on("data", (chunk) => {
1498
1805
  stderr += chunk;
1806
+ options.onStderrChunk?.(chunk);
1499
1807
  });
1500
1808
  child.stdin.end(options.prompt);
1501
1809
  const cleanup = () => {
@@ -1740,6 +2048,8 @@ function resolveCodexConfig(target, env) {
1740
2048
  const argsSource = settings.args ?? settings.arguments;
1741
2049
  const cwdSource = settings.cwd;
1742
2050
  const timeoutSource = settings.timeout_seconds ?? settings.timeoutSeconds;
2051
+ const logDirSource = settings.log_dir ?? settings.logDir ?? settings.log_directory ?? settings.logDirectory;
2052
+ const logFormatSource = settings.log_format ?? settings.logFormat ?? settings.log_output_format ?? settings.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
1743
2053
  const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
1744
2054
  allowLiteral: true,
1745
2055
  optionalEnv: true
@@ -1750,13 +2060,33 @@ function resolveCodexConfig(target, env) {
1750
2060
  optionalEnv: true
1751
2061
  });
1752
2062
  const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} codex timeout`);
2063
+ const logDir = resolveOptionalString(logDirSource, env, `${target.name} codex log directory`, {
2064
+ allowLiteral: true,
2065
+ optionalEnv: true
2066
+ });
2067
+ const logFormat = normalizeCodexLogFormat(logFormatSource);
1753
2068
  return {
1754
2069
  executable,
1755
2070
  args,
1756
2071
  cwd,
1757
- timeoutMs
2072
+ timeoutMs,
2073
+ logDir,
2074
+ logFormat
1758
2075
  };
1759
2076
  }
2077
/**
 * Validates the configured Codex log format.
 * @param {*} value Raw setting or environment value.
 * @returns {"json"|"summary"|undefined} The trimmed, lower-cased format, or
 *   `undefined` when `value` is `undefined`/`null`.
 * @throws {Error} When `value` is not a string or names any other format.
 */
function normalizeCodexLogFormat(value) {
  if (value === void 0 || value === null) {
    return void 0;
  }
  const candidate = typeof value === "string" ? value.trim().toLowerCase() : void 0;
  if (candidate === "json" || candidate === "summary") {
    return candidate;
  }
  throw new Error("codex log format must be 'summary' or 'json'");
}
1760
2090
  function resolveMockConfig(target) {
1761
2091
  const settings = target.settings ?? {};
1762
2092
  const response = typeof settings.response === "string" ? settings.response : void 0;
@@ -2386,7 +2716,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
2386
2716
  }
2387
2717
 
2388
2718
  // src/evaluation/evaluators.ts
2389
- import { randomUUID } from "node:crypto";
2719
+ import { ax, f } from "@ax-llm/ax";
2720
+ import { randomUUID as randomUUID2 } from "node:crypto";
2721
// Structured signature for the LLM judge: a single `evaluationContext` input
// object and a single `evaluation` output object. The builder calls run in the
// same order as one fluent chain would.
var LLM_JUDGE_SIGNATURE = (() => {
  const signatureBuilder = f();
  const withInput = signatureBuilder.input(
    "evaluationContext",
    f.object(
      {
        expectedOutcome: f.string("The expected outcome for the original task"),
        request: f.string("The original task request"),
        referenceAnswer: f.string("The gold standard reference answer"),
        generatedAnswer: f.string("The answer to evaluate"),
        guidelines: f.string("Additional evaluation guidelines or instructions").optional()
      },
      "Complete evaluation context for the judge"
    )
  );
  const withOutput = withInput.output(
    "evaluation",
    f.object({
      score: f.number("Score between 0.0 and 1.0").min(0).max(1),
      hits: f.string("Brief specific achievement").array(),
      misses: f.string("Brief specific failure or omission").array(),
      reasoning: f.string("Concise explanation for the score").max(500)
    })
  );
  return withOutput.build();
})();
// Program created from the signature; its `forward` method is what the judge calls.
var LLM_JUDGE = ax(LLM_JUDGE_SIGNATURE);
2390
2743
  var LlmJudgeEvaluator = class {
2391
2744
  kind = "llm_judge";
2392
2745
  resolveJudgeProvider;
@@ -2404,6 +2757,44 @@ var LlmJudgeEvaluator = class {
2404
2757
  if (!judgeProvider) {
2405
2758
  throw new Error("No judge provider available for LLM grading");
2406
2759
  }
2760
+ if (providerSupportsAx(judgeProvider)) {
2761
+ return this.evaluateWithAx(context, judgeProvider);
2762
+ }
2763
+ return this.evaluateWithPrompt(context, judgeProvider);
2764
+ }
2765
+ async evaluateWithAx(context, judgeProvider) {
2766
+ const ai = judgeProvider.getAxAI();
2767
+ const guidelines = context.promptInputs.guidelines?.trim();
2768
+ const evaluationContext = {
2769
+ expectedOutcome: context.evalCase.outcome.trim(),
2770
+ request: context.evalCase.task.trim(),
2771
+ referenceAnswer: context.evalCase.expected_assistant_raw.trim(),
2772
+ generatedAnswer: context.candidate.trim(),
2773
+ ...guidelines ? { guidelines } : {}
2774
+ };
2775
+ const options = this.buildJudgeForwardOptions(context);
2776
+ const result = await LLM_JUDGE.forward(ai, { evaluationContext }, options);
2777
+ const evaluation = result.evaluation;
2778
+ const expectedAspectCount = Math.max(
2779
+ evaluation.hits.length + evaluation.misses.length,
2780
+ 1
2781
+ );
2782
+ return {
2783
+ score: evaluation.score,
2784
+ hits: evaluation.hits,
2785
+ misses: evaluation.misses,
2786
+ expectedAspectCount,
2787
+ reasoning: evaluation.reasoning,
2788
+ evaluatorRawRequest: {
2789
+ id: randomUUID2(),
2790
+ provider: judgeProvider.id,
2791
+ target: context.target.name,
2792
+ method: "ax-structured-output",
2793
+ signature: LLM_JUDGE_SIGNATURE.toString()
2794
+ }
2795
+ };
2796
+ }
2797
+ async evaluateWithPrompt(context, judgeProvider) {
2407
2798
  const prompt = buildQualityPrompt(context.evalCase, context.candidate);
2408
2799
  const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
2409
2800
  const metadata = {
@@ -2423,8 +2814,9 @@ var LlmJudgeEvaluator = class {
2423
2814
  const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
2424
2815
  const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
2425
2816
  const reasoning = parsed.reasoning ?? response.reasoning;
2817
+ const expectedAspectCount = Math.max(hits.length + misses.length, 1);
2426
2818
  const evaluatorRawRequest = {
2427
- id: randomUUID(),
2819
+ id: randomUUID2(),
2428
2820
  provider: judgeProvider.id,
2429
2821
  prompt,
2430
2822
  target: context.target.name,
@@ -2435,12 +2827,34 @@ var LlmJudgeEvaluator = class {
2435
2827
  score,
2436
2828
  hits,
2437
2829
  misses,
2438
- expectedAspectCount: hits.length + misses.length || 1,
2830
+ expectedAspectCount,
2439
2831
  reasoning,
2440
2832
  evaluatorRawRequest
2441
2833
  };
2442
2834
  }
2835
+ buildJudgeForwardOptions(context) {
2836
+ const modelConfig = this.buildJudgeModelConfig();
2837
+ if (modelConfig === void 0 && context.judgeModel === void 0) {
2838
+ return void 0;
2839
+ }
2840
+ return {
2841
+ ...context.judgeModel ? { model: context.judgeModel } : {},
2842
+ ...modelConfig ? { modelConfig } : {}
2843
+ };
2844
+ }
2845
+ buildJudgeModelConfig() {
2846
+ if (this.maxOutputTokens === void 0 && this.temperature === void 0) {
2847
+ return void 0;
2848
+ }
2849
+ return {
2850
+ ...this.maxOutputTokens !== void 0 ? { maxTokens: this.maxOutputTokens } : {},
2851
+ ...this.temperature !== void 0 ? { temperature: this.temperature } : {}
2852
+ };
2853
+ }
2443
2854
  };
2855
/**
 * Tells whether a provider can hand out an Ax AI instance.
 * @param {object} provider Provider to inspect.
 * @returns {boolean} `true` when `provider.getAxAI` is a function.
 */
function providerSupportsAx(provider) {
  const { getAxAI } = provider;
  return typeof getAxAI === "function";
}
2444
2858
  var QUALITY_SYSTEM_PROMPT = [
2445
2859
  "You are an expert evaluator. Your goal is to grade the generated_answer based on how well it achieves the expected_outcome for the original task.",
2446
2860
  "",
@@ -2663,8 +3077,8 @@ function parseJsonSafe(payload) {
2663
3077
  }
2664
3078
 
2665
3079
  // src/evaluation/orchestrator.ts
2666
- import { createHash, randomUUID as randomUUID2 } from "node:crypto";
2667
- import { mkdir as mkdir2, readFile as readFile4, writeFile as writeFile2 } from "node:fs/promises";
3080
+ import { createHash, randomUUID as randomUUID3 } from "node:crypto";
3081
+ import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
2668
3082
  import path7 from "node:path";
2669
3083
 
2670
3084
  // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
@@ -3211,6 +3625,7 @@ async function evaluateCandidate(options) {
3211
3625
  };
3212
3626
  return {
3213
3627
  eval_id: evalCase.id,
3628
+ dataset: evalCase.dataset,
3214
3629
  conversation_id: evalCase.conversation_id,
3215
3630
  score: score.score,
3216
3631
  hits: score.hits,
@@ -3387,7 +3802,7 @@ async function runLlmJudgeEvaluator(options) {
3387
3802
  async function resolveCustomPrompt(config) {
3388
3803
  if (config.promptPath) {
3389
3804
  try {
3390
- return await readFile4(config.promptPath, "utf8");
3805
+ return await readTextFile(config.promptPath);
3391
3806
  } catch (error) {
3392
3807
  const message = error instanceof Error ? error.message : String(error);
3393
3808
  console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
@@ -3436,7 +3851,7 @@ function sanitizeFilename(value) {
3436
3851
  return "prompt";
3437
3852
  }
3438
3853
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
3439
- return sanitized.length > 0 ? sanitized : randomUUID2();
3854
+ return sanitized.length > 0 ? sanitized : randomUUID3();
3440
3855
  }
3441
3856
  async function invokeProvider(provider, options) {
3442
3857
  const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
@@ -3475,6 +3890,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
3475
3890
  };
3476
3891
  return {
3477
3892
  eval_id: evalCase.id,
3893
+ dataset: evalCase.dataset,
3478
3894
  conversation_id: evalCase.conversation_id,
3479
3895
  score: 0,
3480
3896
  hits: [],
@@ -3524,6 +3940,7 @@ export {
3524
3940
  buildDirectoryChain,
3525
3941
  buildPromptInputs,
3526
3942
  buildSearchRoots,
3943
+ consumeCodexLogEntries,
3527
3944
  createAgentKernel,
3528
3945
  createProvider,
3529
3946
  ensureVSCodeSubagents,
@@ -3540,10 +3957,12 @@ export {
3540
3957
  listTargetNames,
3541
3958
  loadEvalCases,
3542
3959
  readTargetDefinitions,
3960
+ readTextFile,
3543
3961
  resolveAndCreateProvider,
3544
3962
  resolveFileReference,
3545
3963
  resolveTargetDefinition,
3546
3964
  runEvalCase,
3547
- runEvaluation
3965
+ runEvaluation,
3966
+ subscribeToCodexLogEntries
3548
3967
  };
3549
3968
  //# sourceMappingURL=index.js.map