@agentv/core 0.5.1 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-NL7K4CAK.js → chunk-OW3SHBIJ.js} +7 -2
- package/dist/chunk-OW3SHBIJ.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +439 -14
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +30 -2
- package/dist/index.d.ts +30 -2
- package/dist/index.js +434 -15
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-NL7K4CAK.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -36,6 +36,7 @@ __export(index_exports, {
|
|
|
36
36
|
buildDirectoryChain: () => buildDirectoryChain,
|
|
37
37
|
buildPromptInputs: () => buildPromptInputs,
|
|
38
38
|
buildSearchRoots: () => buildSearchRoots,
|
|
39
|
+
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
39
40
|
createAgentKernel: () => createAgentKernel,
|
|
40
41
|
createProvider: () => createProvider,
|
|
41
42
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
@@ -52,11 +53,13 @@ __export(index_exports, {
|
|
|
52
53
|
listTargetNames: () => listTargetNames,
|
|
53
54
|
loadEvalCases: () => loadEvalCases,
|
|
54
55
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
56
|
+
readTextFile: () => readTextFile,
|
|
55
57
|
resolveAndCreateProvider: () => resolveAndCreateProvider,
|
|
56
58
|
resolveFileReference: () => resolveFileReference,
|
|
57
59
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
58
60
|
runEvalCase: () => runEvalCase,
|
|
59
|
-
runEvaluation: () => runEvaluation
|
|
61
|
+
runEvaluation: () => runEvaluation,
|
|
62
|
+
subscribeToCodexLogEntries: () => subscribeToCodexLogEntries
|
|
60
63
|
});
|
|
61
64
|
module.exports = __toCommonJS(index_exports);
|
|
62
65
|
|
|
@@ -130,6 +133,10 @@ async function fileExists(filePath) {
|
|
|
130
133
|
return false;
|
|
131
134
|
}
|
|
132
135
|
}
|
|
136
|
+
/**
 * Reads a file as UTF-8 text and converts Windows line endings to Unix ones.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<string>} The file contents with every CRLF pair replaced by LF.
 */
async function readTextFile(filePath) {
  const rawText = await (0, import_promises.readFile)(filePath, "utf8");
  // Only CRLF pairs are rewritten; a lone carriage return is left untouched.
  const normalizedText = rawText.replace(/\r\n/g, "\n");
  return normalizedText;
}
|
|
133
140
|
async function findGitRoot(startPath) {
|
|
134
141
|
let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
|
|
135
142
|
const root = import_node_path.default.parse(currentDir).root;
|
|
@@ -308,6 +315,9 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
308
315
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
309
316
|
}
|
|
310
317
|
const suite = parsed;
|
|
318
|
+
const datasetNameFromSuite = asString(suite.dataset)?.trim();
|
|
319
|
+
const fallbackDataset = import_node_path2.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
320
|
+
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
311
321
|
const schema = suite.$schema;
|
|
312
322
|
if (schema !== SCHEMA_EVAL_V2) {
|
|
313
323
|
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
@@ -455,6 +465,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
455
465
|
];
|
|
456
466
|
const testCase = {
|
|
457
467
|
id,
|
|
468
|
+
dataset: datasetName,
|
|
458
469
|
conversation_id: conversationId,
|
|
459
470
|
task: userTextPrompt,
|
|
460
471
|
user_segments: userSegments,
|
|
@@ -835,6 +846,9 @@ var AzureProvider = class {
|
|
|
835
846
|
);
|
|
836
847
|
return mapResponse(ensureChatResponse(response));
|
|
837
848
|
}
|
|
849
|
+
// Accessor returning this provider's `ai` field unchanged (presumably the Ax AI client built in the constructor — confirm there).
getAxAI() {
|
|
850
|
+
return this.ai;
|
|
851
|
+
}
|
|
838
852
|
};
|
|
839
853
|
var AnthropicProvider = class {
|
|
840
854
|
constructor(targetName, config) {
|
|
@@ -869,6 +883,9 @@ var AnthropicProvider = class {
|
|
|
869
883
|
);
|
|
870
884
|
return mapResponse(ensureChatResponse(response));
|
|
871
885
|
}
|
|
886
|
+
// Accessor returning this provider's `ai` field unchanged (presumably the Ax AI client built in the constructor — confirm there).
getAxAI() {
|
|
887
|
+
return this.ai;
|
|
888
|
+
}
|
|
872
889
|
};
|
|
873
890
|
var GeminiProvider = class {
|
|
874
891
|
constructor(targetName, config) {
|
|
@@ -902,6 +919,9 @@ var GeminiProvider = class {
|
|
|
902
919
|
);
|
|
903
920
|
return mapResponse(ensureChatResponse(response));
|
|
904
921
|
}
|
|
922
|
+
// Accessor returning this provider's `ai` field unchanged (presumably the Ax AI client built in the constructor — confirm there).
getAxAI() {
|
|
923
|
+
return this.ai;
|
|
924
|
+
}
|
|
905
925
|
};
|
|
906
926
|
|
|
907
927
|
// src/evaluation/providers/cli.ts
|
|
@@ -1114,6 +1134,7 @@ function formatTimeoutSuffix(timeoutMs) {
|
|
|
1114
1134
|
|
|
1115
1135
|
// src/evaluation/providers/codex.ts
|
|
1116
1136
|
var import_node_child_process2 = require("child_process");
|
|
1137
|
+
var import_node_crypto = require("crypto");
|
|
1117
1138
|
var import_node_fs3 = require("fs");
|
|
1118
1139
|
var import_promises3 = require("fs/promises");
|
|
1119
1140
|
var import_node_os = require("os");
|
|
@@ -1221,6 +1242,59 @@ function pathToFileUri(filePath) {
|
|
|
1221
1242
|
return `file://${normalizedPath}`;
|
|
1222
1243
|
}
|
|
1223
1244
|
|
|
1245
|
+
// src/evaluation/providers/codex-log-tracker.ts
|
|
1246
|
+
// Both stores hang off globalThis under Symbol.for() keys. Symbol.for() uses
// the global symbol registry, so any code that asks for the same key string
// reaches the same store (presumably so duplicate copies of this module share
// state — verify against the packaging setup).
var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");

/**
 * Returns the process-wide array of pending Codex log entries, creating it on first use.
 *
 * @returns {Array<object>} The shared, mutable entry list.
 */
function getCodexLogStore() {
  const current = globalThis[GLOBAL_LOGS_KEY];
  if (!current) {
    const fresh = [];
    globalThis[GLOBAL_LOGS_KEY] = fresh;
    return fresh;
  }
  return current;
}

/**
 * Returns the process-wide set of log-entry listeners, creating it on first use.
 *
 * @returns {Set<Function>} The shared, mutable listener set.
 */
function getSubscriberStore() {
  const current = globalThis[GLOBAL_SUBSCRIBERS_KEY];
  if (!current) {
    const fresh = /* @__PURE__ */ new Set();
    globalThis[GLOBAL_SUBSCRIBERS_KEY] = fresh;
    return fresh;
  }
  return current;
}

/**
 * Calls every registered listener with the entry.
 * A listener that throws is reported with console.warn and does not stop the others.
 *
 * @param {object} entry - The log entry to broadcast.
 */
function notifySubscribers(entry) {
  // Iterate over a snapshot so listeners may subscribe/unsubscribe while being notified.
  [...getSubscriberStore()].forEach((listener) => {
    try {
      listener(entry);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Codex log subscriber failed: ${message}`);
    }
  });
}

/**
 * Appends an entry to the shared store and then notifies listeners.
 *
 * @param {object} entry - The log entry to record.
 */
function recordCodexLogEntry(entry) {
  const pending = getCodexLogStore();
  pending.push(entry);
  notifySubscribers(entry);
}

/**
 * Removes and returns all entries recorded since the previous call.
 *
 * @returns {Array<object>} The drained entries, oldest first; an empty array when none are pending.
 */
function consumeCodexLogEntries() {
  const pending = getCodexLogStore();
  return pending.length > 0 ? pending.splice(0, pending.length) : [];
}

/**
 * Registers a listener that is called for each entry recorded from now on.
 *
 * @param {Function} listener - Callback receiving the recorded entry.
 * @returns {Function} Call it to remove the listener again.
 */
function subscribeToCodexLogEntries(listener) {
  const listeners = getSubscriberStore();
  listeners.add(listener);
  return () => {
    listeners.delete(listener);
  };
}
|
|
1297
|
+
|
|
1224
1298
|
// src/evaluation/providers/codex.ts
|
|
1225
1299
|
var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
|
|
1226
1300
|
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
@@ -1252,6 +1326,7 @@ var CodexProvider = class {
|
|
|
1252
1326
|
collectGuidelineFiles(inputFiles, request.guideline_patterns).map((file) => import_node_path5.default.resolve(file))
|
|
1253
1327
|
);
|
|
1254
1328
|
const workspaceRoot = await this.createWorkspace();
|
|
1329
|
+
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
1255
1330
|
try {
|
|
1256
1331
|
const { mirroredInputFiles, guidelineMirrors } = await this.mirrorInputFiles(
|
|
1257
1332
|
inputFiles,
|
|
@@ -1266,7 +1341,7 @@ var CodexProvider = class {
|
|
|
1266
1341
|
await (0, import_promises3.writeFile)(promptFile, promptContent, "utf8");
|
|
1267
1342
|
const args = this.buildCodexArgs();
|
|
1268
1343
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
1269
|
-
const result = await this.executeCodex(args, cwd, promptContent, request.signal);
|
|
1344
|
+
const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
|
|
1270
1345
|
if (result.timedOut) {
|
|
1271
1346
|
throw new Error(
|
|
1272
1347
|
`Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
|
|
@@ -1290,10 +1365,12 @@ var CodexProvider = class {
|
|
|
1290
1365
|
executable: this.resolvedExecutable ?? this.config.executable,
|
|
1291
1366
|
promptFile,
|
|
1292
1367
|
workspace: workspaceRoot,
|
|
1293
|
-
inputFiles: mirroredInputFiles
|
|
1368
|
+
inputFiles: mirroredInputFiles,
|
|
1369
|
+
logFile: logger?.filePath
|
|
1294
1370
|
}
|
|
1295
1371
|
};
|
|
1296
1372
|
} finally {
|
|
1373
|
+
await logger?.close();
|
|
1297
1374
|
await this.cleanupWorkspace(workspaceRoot);
|
|
1298
1375
|
}
|
|
1299
1376
|
}
|
|
@@ -1320,7 +1397,7 @@ var CodexProvider = class {
|
|
|
1320
1397
|
args.push("-");
|
|
1321
1398
|
return args;
|
|
1322
1399
|
}
|
|
1323
|
-
async executeCodex(args, cwd, promptContent, signal) {
|
|
1400
|
+
async executeCodex(args, cwd, promptContent, signal, logger) {
|
|
1324
1401
|
try {
|
|
1325
1402
|
return await this.runCodex({
|
|
1326
1403
|
executable: this.resolvedExecutable ?? this.config.executable,
|
|
@@ -1329,7 +1406,9 @@ var CodexProvider = class {
|
|
|
1329
1406
|
prompt: promptContent,
|
|
1330
1407
|
timeoutMs: this.config.timeoutMs,
|
|
1331
1408
|
env: process.env,
|
|
1332
|
-
signal
|
|
1409
|
+
signal,
|
|
1410
|
+
onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
|
|
1411
|
+
onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
|
|
1333
1412
|
});
|
|
1334
1413
|
} catch (error) {
|
|
1335
1414
|
const err = error;
|
|
@@ -1381,7 +1460,240 @@ var CodexProvider = class {
|
|
|
1381
1460
|
} catch {
|
|
1382
1461
|
}
|
|
1383
1462
|
}
|
|
1463
|
+
resolveLogDirectory() {
|
|
1464
|
+
const disabled = isCodexLogStreamingDisabled();
|
|
1465
|
+
if (disabled) {
|
|
1466
|
+
return void 0;
|
|
1467
|
+
}
|
|
1468
|
+
if (this.config.logDir) {
|
|
1469
|
+
return import_node_path5.default.resolve(this.config.logDir);
|
|
1470
|
+
}
|
|
1471
|
+
return import_node_path5.default.join(process.cwd(), ".agentv", "logs", "codex");
|
|
1472
|
+
}
|
|
1473
|
+
async createStreamLogger(request) {
|
|
1474
|
+
const logDir = this.resolveLogDirectory();
|
|
1475
|
+
if (!logDir) {
|
|
1476
|
+
return void 0;
|
|
1477
|
+
}
|
|
1478
|
+
try {
|
|
1479
|
+
await (0, import_promises3.mkdir)(logDir, { recursive: true });
|
|
1480
|
+
} catch (error) {
|
|
1481
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1482
|
+
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
1483
|
+
return void 0;
|
|
1484
|
+
}
|
|
1485
|
+
const filePath = import_node_path5.default.join(logDir, buildLogFilename(request, this.targetName));
|
|
1486
|
+
try {
|
|
1487
|
+
const logger = await CodexStreamLogger.create({
|
|
1488
|
+
filePath,
|
|
1489
|
+
targetName: this.targetName,
|
|
1490
|
+
evalCaseId: request.evalCaseId,
|
|
1491
|
+
attempt: request.attempt,
|
|
1492
|
+
format: this.config.logFormat ?? "summary"
|
|
1493
|
+
});
|
|
1494
|
+
recordCodexLogEntry({
|
|
1495
|
+
filePath,
|
|
1496
|
+
targetName: this.targetName,
|
|
1497
|
+
evalCaseId: request.evalCaseId,
|
|
1498
|
+
attempt: request.attempt
|
|
1499
|
+
});
|
|
1500
|
+
return logger;
|
|
1501
|
+
} catch (error) {
|
|
1502
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1503
|
+
console.warn(`Skipping Codex stream logging for ${filePath}: ${message}`);
|
|
1504
|
+
return void 0;
|
|
1505
|
+
}
|
|
1506
|
+
}
|
|
1507
|
+
};
|
|
1508
|
+
/**
 * Best-effort, append-only file logger for the stdout/stderr of one Codex CLI run.
 *
 * Chunks are buffered per source until a full line is available; each
 * non-blank line is written as `[+elapsed] [source] message`.
 *
 * Logging must never break the run it observes: a stream failure (for example
 * the file cannot be opened) is reported once with console.warn, later writes
 * are dropped, and `close()` still resolves.
 */
var CodexStreamLogger = class _CodexStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  // First error emitted by the stream, if any. Once set, nothing more is written.
  streamError = void 0;
  /**
   * @param {string} filePath - Log file to append to.
   * @param {string} format - "json" pretty-prints JSON lines; anything else uses the summary formatter.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = (0, import_node_fs3.createWriteStream)(filePath, { flags: "a" });
    // The file is opened asynchronously. Without a persistent listener an
    // open/write failure would surface as an unhandled 'error' event and
    // terminate the process.
    this.stream.on("error", (error) => {
      if (this.streamError !== void 0) {
        return;
      }
      this.streamError = error;
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Codex stream logging failed for ${this.filePath}: ${message}`);
    });
  }
  /**
   * Creates a logger and writes the comment header to the log file.
   *
   * @param {object} options - `filePath`, `targetName`, `format`, and optional `evalCaseId` / `attempt`.
   * @returns {Promise<_CodexStreamLogger>} The ready logger.
   */
  static async create(options) {
    const logger = new _CodexStreamLogger(options.filePath, options.format);
    const header = [
      "# Codex CLI stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      // `attempt` is used as a zero-based index here; the header shows it one-based.
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
      // Drop only the omitted optional lines; the trailing "" must survive so
      // a blank line separates the header from the log body.
    ].filter((line) => line !== void 0);
    logger.writeLines(header);
    return logger;
  }
  /** Buffers a stdout chunk and writes every line completed by it. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** Buffers a stderr chunk and writes every line completed by it. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Writes any buffered output and ends the stream.
   *
   * @returns {Promise<void>} Resolves once the stream has finished or failed; never rejects.
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    if (this.streamError !== void 0 || this.stream.destroyed) {
      return;
    }
    await new Promise((resolve) => {
      // A failure while finishing is already reported by the constructor's
      // listener; here it only needs to unblock the caller.
      this.stream.once("error", () => resolve());
      this.stream.end(() => resolve());
    });
  }
  /** Writes each entry followed by a newline. */
  writeLines(lines) {
    for (const line of lines) {
      this.writeRaw(`${line}
`);
    }
  }
  /** Writes text to the stream unless the stream has already failed. */
  writeRaw(text) {
    if (this.streamError !== void 0) {
      return;
    }
    this.stream.write(text);
  }
  /** Formats one raw line and writes it with a trailing newline; blank lines are skipped. */
  writeFormatted(rawLine, source) {
    const formatted = this.formatLine(rawLine, source);
    if (formatted) {
      this.writeRaw(`${formatted}
`);
    }
  }
  /**
   * Writes all complete lines buffered for `source` and keeps the trailing
   * partial line buffered.
   *
   * @param {"stdout" | "stderr"} source - Which buffer to flush.
   */
  flushBuffer(source) {
    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
    const lines = buffer.split(/\r?\n/);
    // The last element is whatever follows the final newline, i.e. an unfinished line.
    const remainder = lines.pop() ?? "";
    if (source === "stdout") {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line of lines) {
      this.writeFormatted(line, source);
    }
  }
  /**
   * Builds the log line for one raw output line.
   *
   * @param {string} rawLine - One line of process output.
   * @param {"stdout" | "stderr"} source - Where the line came from.
   * @returns {string | undefined} The formatted line, or `undefined` for a blank line.
   */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatCodexJsonLog(trimmed) : formatCodexLogMessage(trimmed, source);
    return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
  }
  /** Writes the unterminated tail of both buffers (stdout first) and clears them. */
  flushRemainder() {
    this.writeFormatted(this.stdoutBuffer, "stdout");
    this.writeFormatted(this.stderrBuffer, "stderr");
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
};
|
|
1602
|
+
/**
 * Tells whether the AGENTV_CODEX_STREAM_LOGS environment variable switches
 * Codex stream logging off.
 *
 * @returns {boolean} `true` only when the variable, trimmed and lower-cased,
 *   is "false", "0" or "off"; `false` when it is unset, empty, or anything else.
 */
function isCodexLogStreamingDisabled() {
  const rawFlag = process.env.AGENTV_CODEX_STREAM_LOGS;
  if (rawFlag) {
    const flag = rawFlag.trim().toLowerCase();
    return ["false", "0", "off"].includes(flag);
  }
  return false;
}
|
|
1610
|
+
/**
 * Builds a unique log file name of the form
 * `<timestamp>_<target>_<evalId>[_attempt-N]_<8 uuid chars>.log`.
 *
 * @param {object} request - Provider request; `evalCaseId` (defaults to "codex") and `attempt` are read.
 * @param {string} targetName - Target name, sanitized before use.
 * @returns {string} The file name (no directory part).
 */
function buildLogFilename(request, targetName) {
  // ':' and '.' in the ISO timestamp are replaced with '-'.
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const targetPart = sanitizeForFilename(targetName);
  const evalPart = sanitizeForFilename(request.evalCaseId ?? "codex");
  // The suffix shows `attempt` one-based.
  const attemptPart = request.attempt === void 0 ? "" : `_attempt-${request.attempt + 1}`;
  const uniquePart = (0, import_node_crypto.randomUUID)().slice(0, 8);
  return `${stamp}_${targetPart}_${evalPart}${attemptPart}_${uniquePart}.log`;
}
|
|
1617
|
+
/**
 * Makes a string safe to embed in a file name.
 *
 * @param {string} value - Text to sanitize.
 * @returns {string} `value` with each run of characters outside
 *   `A-Z a-z 0-9 . _ -` collapsed to a single "_"; "codex" when the result is empty.
 */
function sanitizeForFilename(value) {
  // An empty string is the only falsy result, so `||` supplies the fallback exactly then.
  return value.replace(/[^A-Za-z0-9._-]+/g, "_") || "codex";
}
|
|
1621
|
+
/**
 * Formats the wall-clock time elapsed since `startedAt`.
 *
 * @param {number} startedAt - Start time in milliseconds, as returned by `Date.now()`.
 * @returns {string} `MM:SS`, or `HH:MM:SS` once at least one hour has elapsed.
 */
function formatElapsed(startedAt) {
  // Date.now() is not monotonic: if the system clock is stepped backwards the
  // difference goes negative, which would render as e.g. "-1:-5". Clamp to zero.
  const elapsedMs = Math.max(0, Date.now() - startedAt);
  const elapsedSeconds = Math.floor(elapsedMs / 1e3);
  const hours = Math.floor(elapsedSeconds / 3600);
  const minutes = Math.floor(elapsedSeconds % 3600 / 60);
  const seconds = elapsedSeconds % 60;
  const pad = (value) => value.toString().padStart(2, "0");
  if (hours > 0) {
    return `${pad(hours)}:${pad(minutes)}:${pad(seconds)}`;
  }
  return `${pad(minutes)}:${pad(seconds)}`;
}
|
|
1631
|
+
/**
 * Produces the "summary" rendering of one output line.
 *
 * @param {string} rawLine - The line as emitted by the process.
 * @param {string} source - "stdout" or "stderr".
 * @returns {string} The event summary when the line parses as JSON and a
 *   summary is available; otherwise the raw line, prefixed with "stderr: "
 *   for stderr output.
 */
function formatCodexLogMessage(rawLine, source) {
  const event = tryParseJsonValue(rawLine);
  const summary = event ? summarizeCodexEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
|
|
1644
|
+
/**
 * Produces the "json" rendering of one output line.
 *
 * @param {string} rawLine - The line as emitted by the process.
 * @returns {string} The parsed value pretty-printed with two-space
 *   indentation, or `rawLine` unchanged when it does not parse to a truthy
 *   value or cannot be re-serialized.
 */
function formatCodexJsonLog(rawLine) {
  const value = tryParseJsonValue(rawLine);
  if (value) {
    try {
      return JSON.stringify(value, null, 2);
    } catch {
      // Serialization failed: fall through to the raw line.
    }
  }
  return rawLine;
}
|
|
1655
|
+
/**
 * Condenses a parsed Codex JSONL event into a one-line description.
 *
 * @param {unknown} event - Parsed JSON value for one output line.
 * @returns {string | undefined} `"<type>: <message>"` when both are known,
 *   the message alone when the event has no string `type`,
 *   `"<type>:<item type>"` or just the type when no message text is found,
 *   and `undefined` for non-object input.
 */
function summarizeCodexEvent(event) {
  if (typeof event !== "object" || !event) {
    return void 0;
  }
  const eventType = typeof event.type === "string" ? event.type : void 0;
  // Message text is looked up in order: the event itself, its item, then its output/content.
  let text = extractFromEvent(event) ?? extractFromItem(event.item) ?? flattenContent(event.output ?? event.content);
  const item = event.item;
  if (!text && eventType === JSONL_TYPE_ITEM_COMPLETED && item && typeof item === "object") {
    // Completed items get one more attempt, reading the item's own text/content/output.
    const fallback = flattenContent(item.text ?? item.content ?? item.output);
    if (fallback) {
      text = fallback;
    }
  }
  if (text) {
    return eventType ? `${eventType}: ${text}` : text;
  }
  const itemType = typeof item?.type === "string" ? item.type : void 0;
  return eventType && itemType ? `${eventType}:${itemType}` : eventType;
}
|
|
1690
|
+
/**
 * Parses a line as JSON without throwing.
 *
 * @param {string} rawLine - Text to parse.
 * @returns {unknown} The parsed value, or `undefined` when `rawLine` is not valid JSON.
 */
function tryParseJsonValue(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
|
|
1385
1697
|
async function locateExecutable(candidate) {
|
|
1386
1698
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
1387
1699
|
if (includesPathSeparator) {
|
|
@@ -1651,10 +1963,12 @@ async function defaultCodexRunner(options) {
|
|
|
1651
1963
|
child.stdout.setEncoding("utf8");
|
|
1652
1964
|
child.stdout.on("data", (chunk) => {
|
|
1653
1965
|
stdout += chunk;
|
|
1966
|
+
options.onStdoutChunk?.(chunk);
|
|
1654
1967
|
});
|
|
1655
1968
|
child.stderr.setEncoding("utf8");
|
|
1656
1969
|
child.stderr.on("data", (chunk) => {
|
|
1657
1970
|
stderr += chunk;
|
|
1971
|
+
options.onStderrChunk?.(chunk);
|
|
1658
1972
|
});
|
|
1659
1973
|
child.stdin.end(options.prompt);
|
|
1660
1974
|
const cleanup = () => {
|
|
@@ -1899,6 +2213,8 @@ function resolveCodexConfig(target, env) {
|
|
|
1899
2213
|
const argsSource = settings.args ?? settings.arguments;
|
|
1900
2214
|
const cwdSource = settings.cwd;
|
|
1901
2215
|
const timeoutSource = settings.timeout_seconds ?? settings.timeoutSeconds;
|
|
2216
|
+
const logDirSource = settings.log_dir ?? settings.logDir ?? settings.log_directory ?? settings.logDirectory;
|
|
2217
|
+
const logFormatSource = settings.log_format ?? settings.logFormat ?? settings.log_output_format ?? settings.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
|
|
1902
2218
|
const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
|
|
1903
2219
|
allowLiteral: true,
|
|
1904
2220
|
optionalEnv: true
|
|
@@ -1909,13 +2225,33 @@ function resolveCodexConfig(target, env) {
|
|
|
1909
2225
|
optionalEnv: true
|
|
1910
2226
|
});
|
|
1911
2227
|
const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} codex timeout`);
|
|
2228
|
+
const logDir = resolveOptionalString(logDirSource, env, `${target.name} codex log directory`, {
|
|
2229
|
+
allowLiteral: true,
|
|
2230
|
+
optionalEnv: true
|
|
2231
|
+
});
|
|
2232
|
+
const logFormat = normalizeCodexLogFormat(logFormatSource);
|
|
1912
2233
|
return {
|
|
1913
2234
|
executable,
|
|
1914
2235
|
args,
|
|
1915
2236
|
cwd,
|
|
1916
|
-
timeoutMs
|
|
2237
|
+
timeoutMs,
|
|
2238
|
+
logDir,
|
|
2239
|
+
logFormat
|
|
1917
2240
|
};
|
|
1918
2241
|
}
|
|
2242
|
+
/**
 * Validates and canonicalizes the configured Codex log format.
 *
 * @param {unknown} value - Raw setting; may be absent.
 * @returns {"json" | "summary" | undefined} The trimmed, lower-cased format,
 *   or `undefined` when `value` is `undefined` or `null`.
 * @throws {Error} When `value` is not a string or is neither "summary" nor "json".
 */
function normalizeCodexLogFormat(value) {
  if (value == null) {
    return void 0;
  }
  // Non-strings map to `undefined`, which falls into the error branch below.
  const candidate = typeof value === "string" ? value.trim().toLowerCase() : void 0;
  switch (candidate) {
    case "json":
    case "summary":
      return candidate;
    default:
      throw new Error("codex log format must be 'summary' or 'json'");
  }
}
|
|
1919
2255
|
function resolveMockConfig(target) {
|
|
1920
2256
|
const settings = target.settings ?? {};
|
|
1921
2257
|
const response = typeof settings.response === "string" ? settings.response : void 0;
|
|
@@ -2550,7 +2886,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
2550
2886
|
}
|
|
2551
2887
|
|
|
2552
2888
|
// src/evaluation/evaluators.ts
|
|
2553
|
-
var
|
|
2889
|
+
var import_ax3 = require("@ax-llm/ax");
|
|
2890
|
+
var import_node_crypto2 = require("crypto");
|
|
2891
|
+
// Signature for the LLM judge, built with the `f` builder from @ax-llm/ax:
// one structured input ("evaluationContext") and one structured output
// ("evaluation": score, hits, misses, reasoning).
var LLM_JUDGE_SIGNATURE = (() => {
  const field = import_ax3.f;
  const builder = field();
  const withInput = builder.input(
    "evaluationContext",
    field.object(
      {
        expectedOutcome: field.string("The expected outcome for the original task"),
        request: field.string("The original task request"),
        referenceAnswer: field.string("The gold standard reference answer"),
        generatedAnswer: field.string("The answer to evaluate"),
        guidelines: field.string("Additional evaluation guidelines or instructions").optional()
      },
      "Complete evaluation context for the judge"
    )
  );
  const withOutput = withInput.output(
    "evaluation",
    field.object({
      score: field.number("Score between 0.0 and 1.0").min(0).max(1),
      hits: field.string("Brief specific achievement").array(),
      misses: field.string("Brief specific failure or omission").array(),
      reasoning: field.string("Concise explanation for the score").max(500)
    })
  );
  return withOutput.build();
})();
// Program created from the signature; its `forward` method is what runs a judgement.
var LLM_JUDGE = (0, import_ax3.ax)(LLM_JUDGE_SIGNATURE);
|
|
2554
2913
|
var LlmJudgeEvaluator = class {
|
|
2555
2914
|
kind = "llm_judge";
|
|
2556
2915
|
resolveJudgeProvider;
|
|
@@ -2568,6 +2927,44 @@ var LlmJudgeEvaluator = class {
|
|
|
2568
2927
|
if (!judgeProvider) {
|
|
2569
2928
|
throw new Error("No judge provider available for LLM grading");
|
|
2570
2929
|
}
|
|
2930
|
+
if (providerSupportsAx(judgeProvider)) {
|
|
2931
|
+
return this.evaluateWithAx(context, judgeProvider);
|
|
2932
|
+
}
|
|
2933
|
+
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2934
|
+
}
|
|
2935
|
+
async evaluateWithAx(context, judgeProvider) {
|
|
2936
|
+
const ai = judgeProvider.getAxAI();
|
|
2937
|
+
const guidelines = context.promptInputs.guidelines?.trim();
|
|
2938
|
+
const evaluationContext = {
|
|
2939
|
+
expectedOutcome: context.evalCase.outcome.trim(),
|
|
2940
|
+
request: context.evalCase.task.trim(),
|
|
2941
|
+
referenceAnswer: context.evalCase.expected_assistant_raw.trim(),
|
|
2942
|
+
generatedAnswer: context.candidate.trim(),
|
|
2943
|
+
...guidelines ? { guidelines } : {}
|
|
2944
|
+
};
|
|
2945
|
+
const options = this.buildJudgeForwardOptions(context);
|
|
2946
|
+
const result = await LLM_JUDGE.forward(ai, { evaluationContext }, options);
|
|
2947
|
+
const evaluation = result.evaluation;
|
|
2948
|
+
const expectedAspectCount = Math.max(
|
|
2949
|
+
evaluation.hits.length + evaluation.misses.length,
|
|
2950
|
+
1
|
|
2951
|
+
);
|
|
2952
|
+
return {
|
|
2953
|
+
score: evaluation.score,
|
|
2954
|
+
hits: evaluation.hits,
|
|
2955
|
+
misses: evaluation.misses,
|
|
2956
|
+
expectedAspectCount,
|
|
2957
|
+
reasoning: evaluation.reasoning,
|
|
2958
|
+
evaluatorRawRequest: {
|
|
2959
|
+
id: (0, import_node_crypto2.randomUUID)(),
|
|
2960
|
+
provider: judgeProvider.id,
|
|
2961
|
+
target: context.target.name,
|
|
2962
|
+
method: "ax-structured-output",
|
|
2963
|
+
signature: LLM_JUDGE_SIGNATURE.toString()
|
|
2964
|
+
}
|
|
2965
|
+
};
|
|
2966
|
+
}
|
|
2967
|
+
async evaluateWithPrompt(context, judgeProvider) {
|
|
2571
2968
|
const prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
2572
2969
|
const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
|
|
2573
2970
|
const metadata = {
|
|
@@ -2587,8 +2984,9 @@ var LlmJudgeEvaluator = class {
|
|
|
2587
2984
|
const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
2588
2985
|
const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
2589
2986
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
2987
|
+
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
2590
2988
|
const evaluatorRawRequest = {
|
|
2591
|
-
id: (0,
|
|
2989
|
+
id: (0, import_node_crypto2.randomUUID)(),
|
|
2592
2990
|
provider: judgeProvider.id,
|
|
2593
2991
|
prompt,
|
|
2594
2992
|
target: context.target.name,
|
|
@@ -2599,12 +2997,34 @@ var LlmJudgeEvaluator = class {
|
|
|
2599
2997
|
score,
|
|
2600
2998
|
hits,
|
|
2601
2999
|
misses,
|
|
2602
|
-
expectedAspectCount
|
|
3000
|
+
expectedAspectCount,
|
|
2603
3001
|
reasoning,
|
|
2604
3002
|
evaluatorRawRequest
|
|
2605
3003
|
};
|
|
2606
3004
|
}
|
|
3005
|
+
buildJudgeForwardOptions(context) {
|
|
3006
|
+
const modelConfig = this.buildJudgeModelConfig();
|
|
3007
|
+
if (modelConfig === void 0 && context.judgeModel === void 0) {
|
|
3008
|
+
return void 0;
|
|
3009
|
+
}
|
|
3010
|
+
return {
|
|
3011
|
+
...context.judgeModel ? { model: context.judgeModel } : {},
|
|
3012
|
+
...modelConfig ? { modelConfig } : {}
|
|
3013
|
+
};
|
|
3014
|
+
}
|
|
3015
|
+
buildJudgeModelConfig() {
|
|
3016
|
+
if (this.maxOutputTokens === void 0 && this.temperature === void 0) {
|
|
3017
|
+
return void 0;
|
|
3018
|
+
}
|
|
3019
|
+
return {
|
|
3020
|
+
...this.maxOutputTokens !== void 0 ? { maxTokens: this.maxOutputTokens } : {},
|
|
3021
|
+
...this.temperature !== void 0 ? { temperature: this.temperature } : {}
|
|
3022
|
+
};
|
|
3023
|
+
}
|
|
2607
3024
|
};
|
|
3025
|
+
/**
 * Checks whether a provider exposes a `getAxAI` method.
 *
 * @param {object} provider - Provider instance to inspect; must not be null or undefined.
 * @returns {boolean} `true` when `provider.getAxAI` is a function.
 */
function providerSupportsAx(provider) {
  const { getAxAI } = provider;
  return typeof getAxAI === "function";
}
|
|
2608
3028
|
var QUALITY_SYSTEM_PROMPT = [
|
|
2609
3029
|
"You are an expert evaluator. Your goal is to grade the generated_answer based on how well it achieves the expected_outcome for the original task.",
|
|
2610
3030
|
"",
|
|
@@ -2827,7 +3247,7 @@ function parseJsonSafe(payload) {
|
|
|
2827
3247
|
}
|
|
2828
3248
|
|
|
2829
3249
|
// src/evaluation/orchestrator.ts
|
|
2830
|
-
var
|
|
3250
|
+
var import_node_crypto3 = require("crypto");
|
|
2831
3251
|
var import_promises6 = require("fs/promises");
|
|
2832
3252
|
var import_node_path8 = __toESM(require("path"), 1);
|
|
2833
3253
|
|
|
@@ -3375,6 +3795,7 @@ async function evaluateCandidate(options) {
|
|
|
3375
3795
|
};
|
|
3376
3796
|
return {
|
|
3377
3797
|
eval_id: evalCase.id,
|
|
3798
|
+
dataset: evalCase.dataset,
|
|
3378
3799
|
conversation_id: evalCase.conversation_id,
|
|
3379
3800
|
score: score.score,
|
|
3380
3801
|
hits: score.hits,
|
|
@@ -3551,7 +3972,7 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
3551
3972
|
async function resolveCustomPrompt(config) {
|
|
3552
3973
|
if (config.promptPath) {
|
|
3553
3974
|
try {
|
|
3554
|
-
return await (
|
|
3975
|
+
return await readTextFile(config.promptPath);
|
|
3555
3976
|
} catch (error) {
|
|
3556
3977
|
const message = error instanceof Error ? error.message : String(error);
|
|
3557
3978
|
console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
|
|
@@ -3600,7 +4021,7 @@ function sanitizeFilename(value) {
|
|
|
3600
4021
|
return "prompt";
|
|
3601
4022
|
}
|
|
3602
4023
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
3603
|
-
return sanitized.length > 0 ? sanitized : (0,
|
|
4024
|
+
return sanitized.length > 0 ? sanitized : (0, import_node_crypto3.randomUUID)();
|
|
3604
4025
|
}
|
|
3605
4026
|
async function invokeProvider(provider, options) {
|
|
3606
4027
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
@@ -3639,6 +4060,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
|
|
|
3639
4060
|
};
|
|
3640
4061
|
return {
|
|
3641
4062
|
eval_id: evalCase.id,
|
|
4063
|
+
dataset: evalCase.dataset,
|
|
3642
4064
|
conversation_id: evalCase.conversation_id,
|
|
3643
4065
|
score: 0,
|
|
3644
4066
|
hits: [],
|
|
@@ -3652,7 +4074,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
|
|
|
3652
4074
|
};
|
|
3653
4075
|
}
|
|
3654
4076
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
3655
|
-
const hash = (0,
|
|
4077
|
+
const hash = (0, import_node_crypto3.createHash)("sha256");
|
|
3656
4078
|
hash.update(provider.id);
|
|
3657
4079
|
hash.update(target.name);
|
|
3658
4080
|
hash.update(evalCase.id);
|
|
@@ -3689,6 +4111,7 @@ function createAgentKernel() {
|
|
|
3689
4111
|
buildDirectoryChain,
|
|
3690
4112
|
buildPromptInputs,
|
|
3691
4113
|
buildSearchRoots,
|
|
4114
|
+
consumeCodexLogEntries,
|
|
3692
4115
|
createAgentKernel,
|
|
3693
4116
|
createProvider,
|
|
3694
4117
|
ensureVSCodeSubagents,
|
|
@@ -3705,10 +4128,12 @@ function createAgentKernel() {
|
|
|
3705
4128
|
listTargetNames,
|
|
3706
4129
|
loadEvalCases,
|
|
3707
4130
|
readTargetDefinitions,
|
|
4131
|
+
readTextFile,
|
|
3708
4132
|
resolveAndCreateProvider,
|
|
3709
4133
|
resolveFileReference,
|
|
3710
4134
|
resolveTargetDefinition,
|
|
3711
4135
|
runEvalCase,
|
|
3712
|
-
runEvaluation
|
|
4136
|
+
runEvaluation,
|
|
4137
|
+
subscribeToCodexLogEntries
|
|
3713
4138
|
});
|
|
3714
4139
|
//# sourceMappingURL=index.cjs.map
|