@agentv/core 0.5.1 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -36,6 +36,7 @@ __export(index_exports, {
36
36
  buildDirectoryChain: () => buildDirectoryChain,
37
37
  buildPromptInputs: () => buildPromptInputs,
38
38
  buildSearchRoots: () => buildSearchRoots,
39
+ consumeCodexLogEntries: () => consumeCodexLogEntries,
39
40
  createAgentKernel: () => createAgentKernel,
40
41
  createProvider: () => createProvider,
41
42
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
@@ -52,11 +53,13 @@ __export(index_exports, {
52
53
  listTargetNames: () => listTargetNames,
53
54
  loadEvalCases: () => loadEvalCases,
54
55
  readTargetDefinitions: () => readTargetDefinitions,
56
+ readTextFile: () => readTextFile,
55
57
  resolveAndCreateProvider: () => resolveAndCreateProvider,
56
58
  resolveFileReference: () => resolveFileReference,
57
59
  resolveTargetDefinition: () => resolveTargetDefinition,
58
60
  runEvalCase: () => runEvalCase,
59
- runEvaluation: () => runEvaluation
61
+ runEvaluation: () => runEvaluation,
62
+ subscribeToCodexLogEntries: () => subscribeToCodexLogEntries
60
63
  });
61
64
  module.exports = __toCommonJS(index_exports);
62
65
 
@@ -130,6 +133,10 @@ async function fileExists(filePath) {
130
133
  return false;
131
134
  }
132
135
  }
136
+ async function readTextFile(filePath) {
137
+ const content = await (0, import_promises.readFile)(filePath, "utf8");
138
+ return content.replace(/\r\n/g, "\n");
139
+ }
133
140
  async function findGitRoot(startPath) {
134
141
  let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
135
142
  const root = import_node_path.default.parse(currentDir).root;
@@ -308,6 +315,9 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
308
315
  throw new Error(`Invalid test file format: ${evalFilePath}`);
309
316
  }
310
317
  const suite = parsed;
318
+ const datasetNameFromSuite = asString(suite.dataset)?.trim();
319
+ const fallbackDataset = import_node_path2.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
320
+ const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
311
321
  const schema = suite.$schema;
312
322
  if (schema !== SCHEMA_EVAL_V2) {
313
323
  const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
@@ -455,6 +465,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
455
465
  ];
456
466
  const testCase = {
457
467
  id,
468
+ dataset: datasetName,
458
469
  conversation_id: conversationId,
459
470
  task: userTextPrompt,
460
471
  user_segments: userSegments,
@@ -835,6 +846,9 @@ var AzureProvider = class {
835
846
  );
836
847
  return mapResponse(ensureChatResponse(response));
837
848
  }
849
+ getAxAI() {
850
+ return this.ai;
851
+ }
838
852
  };
839
853
  var AnthropicProvider = class {
840
854
  constructor(targetName, config) {
@@ -869,6 +883,9 @@ var AnthropicProvider = class {
869
883
  );
870
884
  return mapResponse(ensureChatResponse(response));
871
885
  }
886
+ getAxAI() {
887
+ return this.ai;
888
+ }
872
889
  };
873
890
  var GeminiProvider = class {
874
891
  constructor(targetName, config) {
@@ -902,6 +919,9 @@ var GeminiProvider = class {
902
919
  );
903
920
  return mapResponse(ensureChatResponse(response));
904
921
  }
922
+ getAxAI() {
923
+ return this.ai;
924
+ }
905
925
  };
906
926
 
907
927
  // src/evaluation/providers/cli.ts
@@ -1114,6 +1134,7 @@ function formatTimeoutSuffix(timeoutMs) {
1114
1134
 
1115
1135
  // src/evaluation/providers/codex.ts
1116
1136
  var import_node_child_process2 = require("child_process");
1137
+ var import_node_crypto = require("crypto");
1117
1138
  var import_node_fs3 = require("fs");
1118
1139
  var import_promises3 = require("fs/promises");
1119
1140
  var import_node_os = require("os");
@@ -1221,6 +1242,59 @@ function pathToFileUri(filePath) {
1221
1242
  return `file://${normalizedPath}`;
1222
1243
  }
1223
1244
 
1245
+ // src/evaluation/providers/codex-log-tracker.ts
1246
+ var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
1247
+ var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
1248
+ function getCodexLogStore() {
1249
+ const globalObject = globalThis;
1250
+ const existing = globalObject[GLOBAL_LOGS_KEY];
1251
+ if (existing) {
1252
+ return existing;
1253
+ }
1254
+ const created = [];
1255
+ globalObject[GLOBAL_LOGS_KEY] = created;
1256
+ return created;
1257
+ }
1258
+ function getSubscriberStore() {
1259
+ const globalObject = globalThis;
1260
+ const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
1261
+ if (existing) {
1262
+ return existing;
1263
+ }
1264
+ const created = /* @__PURE__ */ new Set();
1265
+ globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
1266
+ return created;
1267
+ }
1268
+ function notifySubscribers(entry) {
1269
+ const subscribers = Array.from(getSubscriberStore());
1270
+ for (const listener of subscribers) {
1271
+ try {
1272
+ listener(entry);
1273
+ } catch (error) {
1274
+ const message = error instanceof Error ? error.message : String(error);
1275
+ console.warn(`Codex log subscriber failed: ${message}`);
1276
+ }
1277
+ }
1278
+ }
1279
+ function recordCodexLogEntry(entry) {
1280
+ getCodexLogStore().push(entry);
1281
+ notifySubscribers(entry);
1282
+ }
1283
+ function consumeCodexLogEntries() {
1284
+ const store = getCodexLogStore();
1285
+ if (store.length === 0) {
1286
+ return [];
1287
+ }
1288
+ return store.splice(0, store.length);
1289
+ }
1290
+ function subscribeToCodexLogEntries(listener) {
1291
+ const store = getSubscriberStore();
1292
+ store.add(listener);
1293
+ return () => {
1294
+ store.delete(listener);
1295
+ };
1296
+ }
1297
+
1224
1298
  // src/evaluation/providers/codex.ts
1225
1299
  var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
1226
1300
  var WORKSPACE_PREFIX = "agentv-codex-";
@@ -1252,6 +1326,7 @@ var CodexProvider = class {
1252
1326
  collectGuidelineFiles(inputFiles, request.guideline_patterns).map((file) => import_node_path5.default.resolve(file))
1253
1327
  );
1254
1328
  const workspaceRoot = await this.createWorkspace();
1329
+ const logger = await this.createStreamLogger(request).catch(() => void 0);
1255
1330
  try {
1256
1331
  const { mirroredInputFiles, guidelineMirrors } = await this.mirrorInputFiles(
1257
1332
  inputFiles,
@@ -1266,7 +1341,7 @@ var CodexProvider = class {
1266
1341
  await (0, import_promises3.writeFile)(promptFile, promptContent, "utf8");
1267
1342
  const args = this.buildCodexArgs();
1268
1343
  const cwd = this.resolveCwd(workspaceRoot);
1269
- const result = await this.executeCodex(args, cwd, promptContent, request.signal);
1344
+ const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
1270
1345
  if (result.timedOut) {
1271
1346
  throw new Error(
1272
1347
  `Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
@@ -1290,10 +1365,12 @@ var CodexProvider = class {
1290
1365
  executable: this.resolvedExecutable ?? this.config.executable,
1291
1366
  promptFile,
1292
1367
  workspace: workspaceRoot,
1293
- inputFiles: mirroredInputFiles
1368
+ inputFiles: mirroredInputFiles,
1369
+ logFile: logger?.filePath
1294
1370
  }
1295
1371
  };
1296
1372
  } finally {
1373
+ await logger?.close();
1297
1374
  await this.cleanupWorkspace(workspaceRoot);
1298
1375
  }
1299
1376
  }
@@ -1320,7 +1397,7 @@ var CodexProvider = class {
1320
1397
  args.push("-");
1321
1398
  return args;
1322
1399
  }
1323
- async executeCodex(args, cwd, promptContent, signal) {
1400
+ async executeCodex(args, cwd, promptContent, signal, logger) {
1324
1401
  try {
1325
1402
  return await this.runCodex({
1326
1403
  executable: this.resolvedExecutable ?? this.config.executable,
@@ -1329,7 +1406,9 @@ var CodexProvider = class {
1329
1406
  prompt: promptContent,
1330
1407
  timeoutMs: this.config.timeoutMs,
1331
1408
  env: process.env,
1332
- signal
1409
+ signal,
1410
+ onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
1411
+ onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
1333
1412
  });
1334
1413
  } catch (error) {
1335
1414
  const err = error;
@@ -1381,7 +1460,240 @@ var CodexProvider = class {
1381
1460
  } catch {
1382
1461
  }
1383
1462
  }
1463
+ resolveLogDirectory() {
1464
+ const disabled = isCodexLogStreamingDisabled();
1465
+ if (disabled) {
1466
+ return void 0;
1467
+ }
1468
+ if (this.config.logDir) {
1469
+ return import_node_path5.default.resolve(this.config.logDir);
1470
+ }
1471
+ return import_node_path5.default.join(process.cwd(), ".agentv", "logs", "codex");
1472
+ }
1473
+ async createStreamLogger(request) {
1474
+ const logDir = this.resolveLogDirectory();
1475
+ if (!logDir) {
1476
+ return void 0;
1477
+ }
1478
+ try {
1479
+ await (0, import_promises3.mkdir)(logDir, { recursive: true });
1480
+ } catch (error) {
1481
+ const message = error instanceof Error ? error.message : String(error);
1482
+ console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
1483
+ return void 0;
1484
+ }
1485
+ const filePath = import_node_path5.default.join(logDir, buildLogFilename(request, this.targetName));
1486
+ try {
1487
+ const logger = await CodexStreamLogger.create({
1488
+ filePath,
1489
+ targetName: this.targetName,
1490
+ evalCaseId: request.evalCaseId,
1491
+ attempt: request.attempt,
1492
+ format: this.config.logFormat ?? "summary"
1493
+ });
1494
+ recordCodexLogEntry({
1495
+ filePath,
1496
+ targetName: this.targetName,
1497
+ evalCaseId: request.evalCaseId,
1498
+ attempt: request.attempt
1499
+ });
1500
+ return logger;
1501
+ } catch (error) {
1502
+ const message = error instanceof Error ? error.message : String(error);
1503
+ console.warn(`Skipping Codex stream logging for ${filePath}: ${message}`);
1504
+ return void 0;
1505
+ }
1506
+ }
1507
+ };
1508
+ var CodexStreamLogger = class _CodexStreamLogger {
1509
+ filePath;
1510
+ stream;
1511
+ startedAt = Date.now();
1512
+ stdoutBuffer = "";
1513
+ stderrBuffer = "";
1514
+ format;
1515
+ constructor(filePath, format) {
1516
+ this.filePath = filePath;
1517
+ this.format = format;
1518
+ this.stream = (0, import_node_fs3.createWriteStream)(filePath, { flags: "a" });
1519
+ }
1520
+ static async create(options) {
1521
+ const logger = new _CodexStreamLogger(options.filePath, options.format);
1522
+ const header = [
1523
+ "# Codex CLI stream log",
1524
+ `# target: ${options.targetName}`,
1525
+ options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
1526
+ options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
1527
+ `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
1528
+ ""
1529
+ ].filter((line) => Boolean(line));
1530
+ logger.writeLines(header);
1531
+ return logger;
1532
+ }
1533
+ handleStdoutChunk(chunk) {
1534
+ this.stdoutBuffer += chunk;
1535
+ this.flushBuffer("stdout");
1536
+ }
1537
+ handleStderrChunk(chunk) {
1538
+ this.stderrBuffer += chunk;
1539
+ this.flushBuffer("stderr");
1540
+ }
1541
+ async close() {
1542
+ this.flushBuffer("stdout");
1543
+ this.flushBuffer("stderr");
1544
+ this.flushRemainder();
1545
+ await new Promise((resolve, reject) => {
1546
+ this.stream.once("error", reject);
1547
+ this.stream.end(() => resolve());
1548
+ });
1549
+ }
1550
+ writeLines(lines) {
1551
+ for (const line of lines) {
1552
+ this.stream.write(`${line}
1553
+ `);
1554
+ }
1555
+ }
1556
+ flushBuffer(source) {
1557
+ const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
1558
+ const lines = buffer.split(/\r?\n/);
1559
+ const remainder = lines.pop() ?? "";
1560
+ if (source === "stdout") {
1561
+ this.stdoutBuffer = remainder;
1562
+ } else {
1563
+ this.stderrBuffer = remainder;
1564
+ }
1565
+ for (const line of lines) {
1566
+ const formatted = this.formatLine(line, source);
1567
+ if (formatted) {
1568
+ this.stream.write(formatted);
1569
+ this.stream.write("\n");
1570
+ }
1571
+ }
1572
+ }
1573
+ formatLine(rawLine, source) {
1574
+ const trimmed = rawLine.trim();
1575
+ if (trimmed.length === 0) {
1576
+ return void 0;
1577
+ }
1578
+ const message = this.format === "json" ? formatCodexJsonLog(trimmed) : formatCodexLogMessage(trimmed, source);
1579
+ return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
1580
+ }
1581
+ flushRemainder() {
1582
+ const stdoutRemainder = this.stdoutBuffer.trim();
1583
+ if (stdoutRemainder.length > 0) {
1584
+ const formatted = this.formatLine(stdoutRemainder, "stdout");
1585
+ if (formatted) {
1586
+ this.stream.write(formatted);
1587
+ this.stream.write("\n");
1588
+ }
1589
+ }
1590
+ const stderrRemainder = this.stderrBuffer.trim();
1591
+ if (stderrRemainder.length > 0) {
1592
+ const formatted = this.formatLine(stderrRemainder, "stderr");
1593
+ if (formatted) {
1594
+ this.stream.write(formatted);
1595
+ this.stream.write("\n");
1596
+ }
1597
+ }
1598
+ this.stdoutBuffer = "";
1599
+ this.stderrBuffer = "";
1600
+ }
1384
1601
  };
1602
+ function isCodexLogStreamingDisabled() {
1603
+ const envValue = process.env.AGENTV_CODEX_STREAM_LOGS;
1604
+ if (!envValue) {
1605
+ return false;
1606
+ }
1607
+ const normalized = envValue.trim().toLowerCase();
1608
+ return normalized === "false" || normalized === "0" || normalized === "off";
1609
+ }
1610
+ function buildLogFilename(request, targetName) {
1611
+ const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1612
+ const evalId = sanitizeForFilename(request.evalCaseId ?? "codex");
1613
+ const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
1614
+ const target = sanitizeForFilename(targetName);
1615
+ return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto.randomUUID)().slice(0, 8)}.log`;
1616
+ }
1617
+ function sanitizeForFilename(value) {
1618
+ const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
1619
+ return sanitized.length > 0 ? sanitized : "codex";
1620
+ }
1621
+ function formatElapsed(startedAt) {
1622
+ const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
1623
+ const hours = Math.floor(elapsedSeconds / 3600);
1624
+ const minutes = Math.floor(elapsedSeconds % 3600 / 60);
1625
+ const seconds = elapsedSeconds % 60;
1626
+ if (hours > 0) {
1627
+ return `${hours.toString().padStart(2, "0")}:${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
1628
+ }
1629
+ return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
1630
+ }
1631
+ function formatCodexLogMessage(rawLine, source) {
1632
+ const parsed = tryParseJsonValue(rawLine);
1633
+ if (parsed) {
1634
+ const summary = summarizeCodexEvent(parsed);
1635
+ if (summary) {
1636
+ return summary;
1637
+ }
1638
+ }
1639
+ if (source === "stderr") {
1640
+ return `stderr: ${rawLine}`;
1641
+ }
1642
+ return rawLine;
1643
+ }
1644
+ function formatCodexJsonLog(rawLine) {
1645
+ const parsed = tryParseJsonValue(rawLine);
1646
+ if (!parsed) {
1647
+ return rawLine;
1648
+ }
1649
+ try {
1650
+ return JSON.stringify(parsed, null, 2);
1651
+ } catch {
1652
+ return rawLine;
1653
+ }
1654
+ }
1655
+ function summarizeCodexEvent(event) {
1656
+ if (!event || typeof event !== "object") {
1657
+ return void 0;
1658
+ }
1659
+ const record = event;
1660
+ const type = typeof record.type === "string" ? record.type : void 0;
1661
+ let message = extractFromEvent(event) ?? extractFromItem(record.item) ?? flattenContent(record.output ?? record.content);
1662
+ if (!message && type === JSONL_TYPE_ITEM_COMPLETED) {
1663
+ const item = record.item;
1664
+ if (item && typeof item === "object") {
1665
+ const candidate = flattenContent(
1666
+ item.text ?? item.content ?? item.output
1667
+ );
1668
+ if (candidate) {
1669
+ message = candidate;
1670
+ }
1671
+ }
1672
+ }
1673
+ if (!message) {
1674
+ const itemType = typeof record.item?.type === "string" ? record.item.type : void 0;
1675
+ if (type && itemType) {
1676
+ return `${type}:${itemType}`;
1677
+ }
1678
+ if (type) {
1679
+ return type;
1680
+ }
1681
+ }
1682
+ if (type && message) {
1683
+ return `${type}: ${message}`;
1684
+ }
1685
+ if (message) {
1686
+ return message;
1687
+ }
1688
+ return type;
1689
+ }
1690
+ function tryParseJsonValue(rawLine) {
1691
+ try {
1692
+ return JSON.parse(rawLine);
1693
+ } catch {
1694
+ return void 0;
1695
+ }
1696
+ }
1385
1697
  async function locateExecutable(candidate) {
1386
1698
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
1387
1699
  if (includesPathSeparator) {
@@ -1651,10 +1963,12 @@ async function defaultCodexRunner(options) {
1651
1963
  child.stdout.setEncoding("utf8");
1652
1964
  child.stdout.on("data", (chunk) => {
1653
1965
  stdout += chunk;
1966
+ options.onStdoutChunk?.(chunk);
1654
1967
  });
1655
1968
  child.stderr.setEncoding("utf8");
1656
1969
  child.stderr.on("data", (chunk) => {
1657
1970
  stderr += chunk;
1971
+ options.onStderrChunk?.(chunk);
1658
1972
  });
1659
1973
  child.stdin.end(options.prompt);
1660
1974
  const cleanup = () => {
@@ -1899,6 +2213,8 @@ function resolveCodexConfig(target, env) {
1899
2213
  const argsSource = settings.args ?? settings.arguments;
1900
2214
  const cwdSource = settings.cwd;
1901
2215
  const timeoutSource = settings.timeout_seconds ?? settings.timeoutSeconds;
2216
+ const logDirSource = settings.log_dir ?? settings.logDir ?? settings.log_directory ?? settings.logDirectory;
2217
+ const logFormatSource = settings.log_format ?? settings.logFormat ?? settings.log_output_format ?? settings.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
1902
2218
  const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
1903
2219
  allowLiteral: true,
1904
2220
  optionalEnv: true
@@ -1909,13 +2225,33 @@ function resolveCodexConfig(target, env) {
1909
2225
  optionalEnv: true
1910
2226
  });
1911
2227
  const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} codex timeout`);
2228
+ const logDir = resolveOptionalString(logDirSource, env, `${target.name} codex log directory`, {
2229
+ allowLiteral: true,
2230
+ optionalEnv: true
2231
+ });
2232
+ const logFormat = normalizeCodexLogFormat(logFormatSource);
1912
2233
  return {
1913
2234
  executable,
1914
2235
  args,
1915
2236
  cwd,
1916
- timeoutMs
2237
+ timeoutMs,
2238
+ logDir,
2239
+ logFormat
1917
2240
  };
1918
2241
  }
2242
+ function normalizeCodexLogFormat(value) {
2243
+ if (value === void 0 || value === null) {
2244
+ return void 0;
2245
+ }
2246
+ if (typeof value !== "string") {
2247
+ throw new Error("codex log format must be 'summary' or 'json'");
2248
+ }
2249
+ const normalized = value.trim().toLowerCase();
2250
+ if (normalized === "json" || normalized === "summary") {
2251
+ return normalized;
2252
+ }
2253
+ throw new Error("codex log format must be 'summary' or 'json'");
2254
+ }
1919
2255
  function resolveMockConfig(target) {
1920
2256
  const settings = target.settings ?? {};
1921
2257
  const response = typeof settings.response === "string" ? settings.response : void 0;
@@ -2550,7 +2886,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
2550
2886
  }
2551
2887
 
2552
2888
  // src/evaluation/evaluators.ts
2553
- var import_node_crypto = require("crypto");
2889
+ var import_ax3 = require("@ax-llm/ax");
2890
+ var import_node_crypto2 = require("crypto");
2891
+ var LLM_JUDGE_SIGNATURE = (0, import_ax3.f)().input(
2892
+ "evaluationContext",
2893
+ import_ax3.f.object(
2894
+ {
2895
+ expectedOutcome: import_ax3.f.string("The expected outcome for the original task"),
2896
+ request: import_ax3.f.string("The original task request"),
2897
+ referenceAnswer: import_ax3.f.string("The gold standard reference answer"),
2898
+ generatedAnswer: import_ax3.f.string("The answer to evaluate"),
2899
+ guidelines: import_ax3.f.string("Additional evaluation guidelines or instructions").optional()
2900
+ },
2901
+ "Complete evaluation context for the judge"
2902
+ )
2903
+ ).output(
2904
+ "evaluation",
2905
+ import_ax3.f.object({
2906
+ score: import_ax3.f.number("Score between 0.0 and 1.0").min(0).max(1),
2907
+ hits: import_ax3.f.string("Brief specific achievement").array(),
2908
+ misses: import_ax3.f.string("Brief specific failure or omission").array(),
2909
+ reasoning: import_ax3.f.string("Concise explanation for the score").max(500)
2910
+ })
2911
+ ).build();
2912
+ var LLM_JUDGE = (0, import_ax3.ax)(LLM_JUDGE_SIGNATURE);
2554
2913
  var LlmJudgeEvaluator = class {
2555
2914
  kind = "llm_judge";
2556
2915
  resolveJudgeProvider;
@@ -2568,6 +2927,44 @@ var LlmJudgeEvaluator = class {
2568
2927
  if (!judgeProvider) {
2569
2928
  throw new Error("No judge provider available for LLM grading");
2570
2929
  }
2930
+ if (providerSupportsAx(judgeProvider)) {
2931
+ return this.evaluateWithAx(context, judgeProvider);
2932
+ }
2933
+ return this.evaluateWithPrompt(context, judgeProvider);
2934
+ }
2935
+ async evaluateWithAx(context, judgeProvider) {
2936
+ const ai = judgeProvider.getAxAI();
2937
+ const guidelines = context.promptInputs.guidelines?.trim();
2938
+ const evaluationContext = {
2939
+ expectedOutcome: context.evalCase.outcome.trim(),
2940
+ request: context.evalCase.task.trim(),
2941
+ referenceAnswer: context.evalCase.expected_assistant_raw.trim(),
2942
+ generatedAnswer: context.candidate.trim(),
2943
+ ...guidelines ? { guidelines } : {}
2944
+ };
2945
+ const options = this.buildJudgeForwardOptions(context);
2946
+ const result = await LLM_JUDGE.forward(ai, { evaluationContext }, options);
2947
+ const evaluation = result.evaluation;
2948
+ const expectedAspectCount = Math.max(
2949
+ evaluation.hits.length + evaluation.misses.length,
2950
+ 1
2951
+ );
2952
+ return {
2953
+ score: evaluation.score,
2954
+ hits: evaluation.hits,
2955
+ misses: evaluation.misses,
2956
+ expectedAspectCount,
2957
+ reasoning: evaluation.reasoning,
2958
+ evaluatorRawRequest: {
2959
+ id: (0, import_node_crypto2.randomUUID)(),
2960
+ provider: judgeProvider.id,
2961
+ target: context.target.name,
2962
+ method: "ax-structured-output",
2963
+ signature: LLM_JUDGE_SIGNATURE.toString()
2964
+ }
2965
+ };
2966
+ }
2967
+ async evaluateWithPrompt(context, judgeProvider) {
2571
2968
  const prompt = buildQualityPrompt(context.evalCase, context.candidate);
2572
2969
  const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
2573
2970
  const metadata = {
@@ -2587,8 +2984,9 @@ var LlmJudgeEvaluator = class {
2587
2984
  const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
2588
2985
  const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
2589
2986
  const reasoning = parsed.reasoning ?? response.reasoning;
2987
+ const expectedAspectCount = Math.max(hits.length + misses.length, 1);
2590
2988
  const evaluatorRawRequest = {
2591
- id: (0, import_node_crypto.randomUUID)(),
2989
+ id: (0, import_node_crypto2.randomUUID)(),
2592
2990
  provider: judgeProvider.id,
2593
2991
  prompt,
2594
2992
  target: context.target.name,
@@ -2599,12 +2997,34 @@ var LlmJudgeEvaluator = class {
2599
2997
  score,
2600
2998
  hits,
2601
2999
  misses,
2602
- expectedAspectCount: hits.length + misses.length || 1,
3000
+ expectedAspectCount,
2603
3001
  reasoning,
2604
3002
  evaluatorRawRequest
2605
3003
  };
2606
3004
  }
3005
+ buildJudgeForwardOptions(context) {
3006
+ const modelConfig = this.buildJudgeModelConfig();
3007
+ if (modelConfig === void 0 && context.judgeModel === void 0) {
3008
+ return void 0;
3009
+ }
3010
+ return {
3011
+ ...context.judgeModel ? { model: context.judgeModel } : {},
3012
+ ...modelConfig ? { modelConfig } : {}
3013
+ };
3014
+ }
3015
+ buildJudgeModelConfig() {
3016
+ if (this.maxOutputTokens === void 0 && this.temperature === void 0) {
3017
+ return void 0;
3018
+ }
3019
+ return {
3020
+ ...this.maxOutputTokens !== void 0 ? { maxTokens: this.maxOutputTokens } : {},
3021
+ ...this.temperature !== void 0 ? { temperature: this.temperature } : {}
3022
+ };
3023
+ }
2607
3024
  };
3025
+ function providerSupportsAx(provider) {
3026
+ return typeof provider.getAxAI === "function";
3027
+ }
2608
3028
  var QUALITY_SYSTEM_PROMPT = [
2609
3029
  "You are an expert evaluator. Your goal is to grade the generated_answer based on how well it achieves the expected_outcome for the original task.",
2610
3030
  "",
@@ -2827,7 +3247,7 @@ function parseJsonSafe(payload) {
2827
3247
  }
2828
3248
 
2829
3249
  // src/evaluation/orchestrator.ts
2830
- var import_node_crypto2 = require("crypto");
3250
+ var import_node_crypto3 = require("crypto");
2831
3251
  var import_promises6 = require("fs/promises");
2832
3252
  var import_node_path8 = __toESM(require("path"), 1);
2833
3253
 
@@ -3375,6 +3795,7 @@ async function evaluateCandidate(options) {
3375
3795
  };
3376
3796
  return {
3377
3797
  eval_id: evalCase.id,
3798
+ dataset: evalCase.dataset,
3378
3799
  conversation_id: evalCase.conversation_id,
3379
3800
  score: score.score,
3380
3801
  hits: score.hits,
@@ -3551,7 +3972,7 @@ async function runLlmJudgeEvaluator(options) {
3551
3972
  async function resolveCustomPrompt(config) {
3552
3973
  if (config.promptPath) {
3553
3974
  try {
3554
- return await (0, import_promises6.readFile)(config.promptPath, "utf8");
3975
+ return await readTextFile(config.promptPath);
3555
3976
  } catch (error) {
3556
3977
  const message = error instanceof Error ? error.message : String(error);
3557
3978
  console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
@@ -3600,7 +4021,7 @@ function sanitizeFilename(value) {
3600
4021
  return "prompt";
3601
4022
  }
3602
4023
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
3603
- return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
4024
+ return sanitized.length > 0 ? sanitized : (0, import_node_crypto3.randomUUID)();
3604
4025
  }
3605
4026
  async function invokeProvider(provider, options) {
3606
4027
  const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
@@ -3639,6 +4060,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
3639
4060
  };
3640
4061
  return {
3641
4062
  eval_id: evalCase.id,
4063
+ dataset: evalCase.dataset,
3642
4064
  conversation_id: evalCase.conversation_id,
3643
4065
  score: 0,
3644
4066
  hits: [],
@@ -3652,7 +4074,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
3652
4074
  };
3653
4075
  }
3654
4076
  function createCacheKey(provider, target, evalCase, promptInputs) {
3655
- const hash = (0, import_node_crypto2.createHash)("sha256");
4077
+ const hash = (0, import_node_crypto3.createHash)("sha256");
3656
4078
  hash.update(provider.id);
3657
4079
  hash.update(target.name);
3658
4080
  hash.update(evalCase.id);
@@ -3689,6 +4111,7 @@ function createAgentKernel() {
3689
4111
  buildDirectoryChain,
3690
4112
  buildPromptInputs,
3691
4113
  buildSearchRoots,
4114
+ consumeCodexLogEntries,
3692
4115
  createAgentKernel,
3693
4116
  createProvider,
3694
4117
  ensureVSCodeSubagents,
@@ -3705,10 +4128,12 @@ function createAgentKernel() {
3705
4128
  listTargetNames,
3706
4129
  loadEvalCases,
3707
4130
  readTargetDefinitions,
4131
+ readTextFile,
3708
4132
  resolveAndCreateProvider,
3709
4133
  resolveFileReference,
3710
4134
  resolveTargetDefinition,
3711
4135
  runEvalCase,
3712
- runEvaluation
4136
+ runEvaluation,
4137
+ subscribeToCodexLogEntries
3713
4138
  });
3714
4139
  //# sourceMappingURL=index.cjs.map