@agentv/core 0.5.1 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-NL7K4CAK.js → chunk-OW3SHBIJ.js} +7 -2
- package/dist/chunk-OW3SHBIJ.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +439 -14
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +30 -2
- package/dist/index.d.ts +30 -2
- package/dist/index.js +434 -15
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-NL7K4CAK.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -4,8 +4,9 @@ import {
|
|
|
4
4
|
buildSearchRoots,
|
|
5
5
|
fileExists,
|
|
6
6
|
findGitRoot,
|
|
7
|
+
readTextFile,
|
|
7
8
|
resolveFileReference
|
|
8
|
-
} from "./chunk-
|
|
9
|
+
} from "./chunk-OW3SHBIJ.js";
|
|
9
10
|
|
|
10
11
|
// src/evaluation/types.ts
|
|
11
12
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -149,6 +150,9 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
149
150
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
150
151
|
}
|
|
151
152
|
const suite = parsed;
|
|
153
|
+
const datasetNameFromSuite = asString(suite.dataset)?.trim();
|
|
154
|
+
const fallbackDataset = path.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
155
|
+
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
152
156
|
const schema = suite.$schema;
|
|
153
157
|
if (schema !== SCHEMA_EVAL_V2) {
|
|
154
158
|
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
@@ -296,6 +300,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
296
300
|
];
|
|
297
301
|
const testCase = {
|
|
298
302
|
id,
|
|
303
|
+
dataset: datasetName,
|
|
299
304
|
conversation_id: conversationId,
|
|
300
305
|
task: userTextPrompt,
|
|
301
306
|
user_segments: userSegments,
|
|
@@ -676,6 +681,9 @@ var AzureProvider = class {
|
|
|
676
681
|
);
|
|
677
682
|
return mapResponse(ensureChatResponse(response));
|
|
678
683
|
}
|
|
684
|
+
getAxAI() {
|
|
685
|
+
return this.ai;
|
|
686
|
+
}
|
|
679
687
|
};
|
|
680
688
|
var AnthropicProvider = class {
|
|
681
689
|
constructor(targetName, config) {
|
|
@@ -710,6 +718,9 @@ var AnthropicProvider = class {
|
|
|
710
718
|
);
|
|
711
719
|
return mapResponse(ensureChatResponse(response));
|
|
712
720
|
}
|
|
721
|
+
getAxAI() {
|
|
722
|
+
return this.ai;
|
|
723
|
+
}
|
|
713
724
|
};
|
|
714
725
|
var GeminiProvider = class {
|
|
715
726
|
constructor(targetName, config) {
|
|
@@ -743,6 +754,9 @@ var GeminiProvider = class {
|
|
|
743
754
|
);
|
|
744
755
|
return mapResponse(ensureChatResponse(response));
|
|
745
756
|
}
|
|
757
|
+
getAxAI() {
|
|
758
|
+
return this.ai;
|
|
759
|
+
}
|
|
746
760
|
};
|
|
747
761
|
|
|
748
762
|
// src/evaluation/providers/cli.ts
|
|
@@ -955,7 +969,8 @@ function formatTimeoutSuffix(timeoutMs) {
|
|
|
955
969
|
|
|
956
970
|
// src/evaluation/providers/codex.ts
|
|
957
971
|
import { exec as execCallback, spawn } from "node:child_process";
|
|
958
|
-
import {
|
|
972
|
+
import { randomUUID } from "node:crypto";
|
|
973
|
+
import { constants as constants2, createWriteStream } from "node:fs";
|
|
959
974
|
import { access as access2, copyFile, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
|
|
960
975
|
import { tmpdir } from "node:os";
|
|
961
976
|
import path4 from "node:path";
|
|
@@ -1062,6 +1077,59 @@ function pathToFileUri(filePath) {
|
|
|
1062
1077
|
return `file://${normalizedPath}`;
|
|
1063
1078
|
}
|
|
1064
1079
|
|
|
1080
|
+
// src/evaluation/providers/codex-log-tracker.ts
|
|
1081
|
+
// Registry-wide symbols: Symbol.for() returns the same symbol for the same key,
// so every copy of this module that runs in the process shares the stores below.
var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");

/**
 * Returns the process-wide array of recorded log entries, creating it on
 * `globalThis` the first time it is requested.
 * @returns {Array<object>} The shared (mutable) entry list.
 */
function getCodexLogStore() {
  return globalThis[GLOBAL_LOGS_KEY] ||= [];
}

/**
 * Returns the process-wide set of subscriber callbacks, creating it on
 * `globalThis` the first time it is requested.
 * @returns {Set<Function>} The shared (mutable) listener set.
 */
function getSubscriberStore() {
  return globalThis[GLOBAL_SUBSCRIBERS_KEY] ||= /* @__PURE__ */ new Set();
}

/**
 * Invokes every registered listener with `entry`.
 * A listener that throws is reported with `console.warn` and does not stop
 * the remaining listeners from running.
 * @param {object} entry - The log entry being announced.
 */
function notifySubscribers(entry) {
  // Iterate over a snapshot so listeners may subscribe/unsubscribe while running.
  [...getSubscriberStore()].forEach((listener) => {
    try {
      listener(entry);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Codex log subscriber failed: ${message}`);
    }
  });
}

/**
 * Appends `entry` to the shared store, then notifies subscribers.
 * @param {object} entry - The log entry to record.
 */
function recordCodexLogEntry(entry) {
  const entries = getCodexLogStore();
  entries.push(entry);
  notifySubscribers(entry);
}

/**
 * Removes and returns every entry recorded so far.
 * @returns {Array<object>} The drained entries (an empty array when none were recorded).
 */
function consumeCodexLogEntries() {
  // splice(0) empties the shared array in place and hands back a fresh array.
  return getCodexLogStore().splice(0);
}

/**
 * Registers `listener` to be called for each subsequently recorded entry.
 * @param {Function} listener - Callback receiving the recorded entry.
 * @returns {Function} A function that removes the listener again.
 */
function subscribeToCodexLogEntries(listener) {
  const listeners = getSubscriberStore();
  listeners.add(listener);
  return function unsubscribe() {
    listeners.delete(listener);
  };
}
|
|
1132
|
+
|
|
1065
1133
|
// src/evaluation/providers/codex.ts
|
|
1066
1134
|
var execAsync2 = promisify2(execCallback);
|
|
1067
1135
|
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
@@ -1093,6 +1161,7 @@ var CodexProvider = class {
|
|
|
1093
1161
|
collectGuidelineFiles(inputFiles, request.guideline_patterns).map((file) => path4.resolve(file))
|
|
1094
1162
|
);
|
|
1095
1163
|
const workspaceRoot = await this.createWorkspace();
|
|
1164
|
+
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
1096
1165
|
try {
|
|
1097
1166
|
const { mirroredInputFiles, guidelineMirrors } = await this.mirrorInputFiles(
|
|
1098
1167
|
inputFiles,
|
|
@@ -1107,7 +1176,7 @@ var CodexProvider = class {
|
|
|
1107
1176
|
await writeFile(promptFile, promptContent, "utf8");
|
|
1108
1177
|
const args = this.buildCodexArgs();
|
|
1109
1178
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
1110
|
-
const result = await this.executeCodex(args, cwd, promptContent, request.signal);
|
|
1179
|
+
const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
|
|
1111
1180
|
if (result.timedOut) {
|
|
1112
1181
|
throw new Error(
|
|
1113
1182
|
`Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
|
|
@@ -1131,10 +1200,12 @@ var CodexProvider = class {
|
|
|
1131
1200
|
executable: this.resolvedExecutable ?? this.config.executable,
|
|
1132
1201
|
promptFile,
|
|
1133
1202
|
workspace: workspaceRoot,
|
|
1134
|
-
inputFiles: mirroredInputFiles
|
|
1203
|
+
inputFiles: mirroredInputFiles,
|
|
1204
|
+
logFile: logger?.filePath
|
|
1135
1205
|
}
|
|
1136
1206
|
};
|
|
1137
1207
|
} finally {
|
|
1208
|
+
await logger?.close();
|
|
1138
1209
|
await this.cleanupWorkspace(workspaceRoot);
|
|
1139
1210
|
}
|
|
1140
1211
|
}
|
|
@@ -1161,7 +1232,7 @@ var CodexProvider = class {
|
|
|
1161
1232
|
args.push("-");
|
|
1162
1233
|
return args;
|
|
1163
1234
|
}
|
|
1164
|
-
async executeCodex(args, cwd, promptContent, signal) {
|
|
1235
|
+
async executeCodex(args, cwd, promptContent, signal, logger) {
|
|
1165
1236
|
try {
|
|
1166
1237
|
return await this.runCodex({
|
|
1167
1238
|
executable: this.resolvedExecutable ?? this.config.executable,
|
|
@@ -1170,7 +1241,9 @@ var CodexProvider = class {
|
|
|
1170
1241
|
prompt: promptContent,
|
|
1171
1242
|
timeoutMs: this.config.timeoutMs,
|
|
1172
1243
|
env: process.env,
|
|
1173
|
-
signal
|
|
1244
|
+
signal,
|
|
1245
|
+
onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
|
|
1246
|
+
onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
|
|
1174
1247
|
});
|
|
1175
1248
|
} catch (error) {
|
|
1176
1249
|
const err = error;
|
|
@@ -1222,7 +1295,240 @@ var CodexProvider = class {
|
|
|
1222
1295
|
} catch {
|
|
1223
1296
|
}
|
|
1224
1297
|
}
|
|
1298
|
+
resolveLogDirectory() {
|
|
1299
|
+
const disabled = isCodexLogStreamingDisabled();
|
|
1300
|
+
if (disabled) {
|
|
1301
|
+
return void 0;
|
|
1302
|
+
}
|
|
1303
|
+
if (this.config.logDir) {
|
|
1304
|
+
return path4.resolve(this.config.logDir);
|
|
1305
|
+
}
|
|
1306
|
+
return path4.join(process.cwd(), ".agentv", "logs", "codex");
|
|
1307
|
+
}
|
|
1308
|
+
async createStreamLogger(request) {
|
|
1309
|
+
const logDir = this.resolveLogDirectory();
|
|
1310
|
+
if (!logDir) {
|
|
1311
|
+
return void 0;
|
|
1312
|
+
}
|
|
1313
|
+
try {
|
|
1314
|
+
await mkdir(logDir, { recursive: true });
|
|
1315
|
+
} catch (error) {
|
|
1316
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1317
|
+
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
1318
|
+
return void 0;
|
|
1319
|
+
}
|
|
1320
|
+
const filePath = path4.join(logDir, buildLogFilename(request, this.targetName));
|
|
1321
|
+
try {
|
|
1322
|
+
const logger = await CodexStreamLogger.create({
|
|
1323
|
+
filePath,
|
|
1324
|
+
targetName: this.targetName,
|
|
1325
|
+
evalCaseId: request.evalCaseId,
|
|
1326
|
+
attempt: request.attempt,
|
|
1327
|
+
format: this.config.logFormat ?? "summary"
|
|
1328
|
+
});
|
|
1329
|
+
recordCodexLogEntry({
|
|
1330
|
+
filePath,
|
|
1331
|
+
targetName: this.targetName,
|
|
1332
|
+
evalCaseId: request.evalCaseId,
|
|
1333
|
+
attempt: request.attempt
|
|
1334
|
+
});
|
|
1335
|
+
return logger;
|
|
1336
|
+
} catch (error) {
|
|
1337
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1338
|
+
console.warn(`Skipping Codex stream logging for ${filePath}: ${message}`);
|
|
1339
|
+
return void 0;
|
|
1340
|
+
}
|
|
1341
|
+
}
|
|
1225
1342
|
};
|
|
1343
|
+
/**
 * Streams Codex CLI stdout/stderr into a log file, one formatted line per
 * complete input line, each prefixed with the elapsed time and its source.
 *
 * Logging is best-effort: if the underlying file stream fails (for example the
 * file cannot be opened), a single warning is printed, further writes are
 * skipped, and `close()` still resolves so callers' cleanup code keeps running.
 */
var CodexStreamLogger = class _CodexStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;
  // Set once the stream has emitted 'error'; all later writes become no-ops.
  streamFailed = false;
  /**
   * @param {string} filePath - Log file to append to.
   * @param {string} format - "json" pretty-prints JSON lines; anything else uses the summary formatter.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = createWriteStream(filePath, { flags: "a" });
    // Without a permanent listener an open/write failure would surface as an
    // unhandled 'error' event and take the whole process down.
    this.stream.on("error", (error) => {
      if (this.streamFailed) {
        return;
      }
      this.streamFailed = true;
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Codex stream logging disabled for ${filePath}: ${message}`);
    });
  }
  /**
   * Builds a logger and writes the comment header (followed by a blank line).
   * @param {{ filePath: string, format: string, targetName: string, evalCaseId?: string, attempt?: number }} options
   * @returns {Promise<_CodexStreamLogger>} The ready-to-use logger.
   */
  static async create(options) {
    const logger = new _CodexStreamLogger(options.filePath, options.format);
    const header = [
      "# Codex CLI stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
      // Drop only the omitted optional lines; the trailing "" is the blank
      // separator between header and body and must survive the filter.
    ].filter((line) => line !== void 0);
    logger.writeLines(header);
    return logger;
  }
  /** Buffers a stdout chunk and writes every complete line it finishes. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }
  /** Buffers a stderr chunk and writes every complete line it finishes. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }
  /**
   * Writes any buffered partial lines and ends the stream.
   * Never rejects: a stream error is already reported by the listener installed
   * in the constructor, and callers invoke this from `finally` blocks.
   * @returns {Promise<void>}
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    if (this.streamFailed || this.stream.destroyed) {
      return;
    }
    await new Promise((resolve) => {
      const settle = () => resolve();
      // Whichever happens first — finish callback or error — completes close().
      this.stream.once("error", settle);
      this.stream.end(settle);
    });
  }
  /** Writes each entry of `lines` followed by a newline. */
  writeLines(lines) {
    for (const line of lines) {
      this.writeRaw(`${line}\n`);
    }
  }
  /** Writes `text` unless the stream has already failed. */
  writeRaw(text) {
    if (!this.streamFailed) {
      this.stream.write(text);
    }
  }
  /**
   * Emits all complete lines currently buffered for `source`, keeping the
   * trailing partial line (if any) in the buffer.
   * @param {"stdout" | "stderr"} source
   */
  flushBuffer(source) {
    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
    const lines = buffer.split(/\r?\n/);
    const remainder = lines.pop() ?? "";
    if (source === "stdout") {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line of lines) {
      this.writeFormatted(line, source);
    }
  }
  /** Formats one raw line and writes it; blank lines are skipped. */
  writeFormatted(rawLine, source) {
    const formatted = this.formatLine(rawLine, source);
    if (formatted) {
      this.writeRaw(`${formatted}\n`);
    }
  }
  /**
   * @param {string} rawLine - One line of process output.
   * @param {"stdout" | "stderr"} source
   * @returns {string | undefined} The log line, or `undefined` for whitespace-only input.
   */
  formatLine(rawLine, source) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatCodexJsonLog(trimmed) : formatCodexLogMessage(trimmed, source);
    return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
  }
  /** Writes whatever partial lines remain (stdout first) and clears both buffers. */
  flushRemainder() {
    this.writeFormatted(this.stdoutBuffer, "stdout");
    this.writeFormatted(this.stderrBuffer, "stderr");
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
};
|
|
1437
|
+
/**
 * Reports whether the `AGENTV_CODEX_STREAM_LOGS` environment variable turns
 * Codex stream logging off.
 * @returns {boolean} `true` only when the variable, trimmed and lower-cased,
 *   is "false", "0" or "off"; `false` when it is unset, empty, or anything else.
 */
function isCodexLogStreamingDisabled() {
  const raw = process.env.AGENTV_CODEX_STREAM_LOGS;
  if (!raw) {
    return false;
  }
  return ["false", "0", "off"].includes(raw.trim().toLowerCase());
}
|
|
1445
|
+
/**
 * Builds a unique log file name of the form
 * `<iso-timestamp>_<target>_<evalId>[_attempt-N]_<8 uuid chars>.log`.
 * @param {{ evalCaseId?: string, attempt?: number }} request - `evalCaseId`
 *   falls back to "codex"; `attempt` is zero-based and rendered one-based.
 * @param {string} targetName - Target name, sanitized for use in a file name.
 * @returns {string} The file name (no directory component).
 */
function buildLogFilename(request, targetName) {
  const { evalCaseId, attempt } = request;
  // ':' and '.' from the ISO timestamp are replaced so the name is portable.
  const stamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
  const evalPart = sanitizeForFilename(evalCaseId ?? "codex");
  const attemptPart = attempt === void 0 ? "" : `_attempt-${attempt + 1}`;
  const targetPart = sanitizeForFilename(targetName);
  const uniquePart = randomUUID().slice(0, 8);
  return `${stamp}_${targetPart}_${evalPart}${attemptPart}_${uniquePart}.log`;
}
|
|
1452
|
+
/**
 * Makes `value` safe for use inside a file name by collapsing every run of
 * characters outside `A-Z a-z 0-9 . _ -` into a single underscore.
 * @param {string} value - Text to sanitize.
 * @returns {string} The sanitized text, or "codex" when the result is empty.
 */
function sanitizeForFilename(value) {
  const cleaned = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  if (cleaned.length > 0) {
    return cleaned;
  }
  return "codex";
}
|
|
1456
|
+
/**
 * Formats the whole seconds elapsed since `startedAt` as `MM:SS`, or as
 * `HH:MM:SS` once at least one hour has passed.
 * @param {number} startedAt - Start time in epoch milliseconds (as from `Date.now()`).
 * @returns {string} Zero-padded elapsed time.
 */
function formatElapsed(startedAt) {
  const twoDigits = (part) => part.toString().padStart(2, "0");
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor(totalSeconds % 3600 / 60);
  const seconds = totalSeconds % 60;
  const parts = hours > 0 ? [hours, minutes, seconds] : [minutes, seconds];
  return parts.map(twoDigits).join(":");
}
|
|
1466
|
+
/**
 * Produces the summary-format text for one line of Codex output.
 * A line that parses as JSON and yields a summary is replaced by that summary;
 * otherwise the raw line is kept, prefixed with "stderr: " for stderr output.
 * @param {string} rawLine - One (already trimmed) output line.
 * @param {"stdout" | "stderr"} source - Which stream the line came from.
 * @returns {string} The text to log.
 */
function formatCodexLogMessage(rawLine, source) {
  const event = tryParseJsonValue(rawLine);
  const summary = event ? summarizeCodexEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
|
|
1479
|
+
/**
 * Produces the json-format text for one line of Codex output: valid JSON is
 * re-serialized with two-space indentation, anything else is returned as-is.
 * @param {string} rawLine - One (already trimmed) output line.
 * @returns {string} Pretty-printed JSON, or the raw line when it is not JSON
 *   (or parses to a falsy value).
 */
function formatCodexJsonLog(rawLine) {
  const value = tryParseJsonValue(rawLine);
  if (value) {
    try {
      return JSON.stringify(value, null, 2);
    } catch {
      // Serialization failed; fall through and log the line unchanged.
    }
  }
  return rawLine;
}
|
|
1490
|
+
/**
 * Condenses a parsed Codex JSONL event into a single log string.
 *
 * The message text is taken from the first helper that yields one
 * (`extractFromEvent`, `extractFromItem`, then `flattenContent` over
 * `output`/`content`); for item-completed events the item's
 * `text`/`content`/`output` is tried as a last resort.
 *
 * @param {unknown} event - Parsed JSON value.
 * @returns {string | undefined} `"<type>: <message>"`, the bare message, 
 *   `"<type>:<itemType>"`, the bare type, or `undefined` for non-objects and
 *   events carrying neither a type nor a message.
 */
function summarizeCodexEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const record = event;
  const { item } = record;
  const type = typeof record.type === "string" ? record.type : void 0;
  let message = extractFromEvent(event) ?? extractFromItem(item) ?? flattenContent(record.output ?? record.content);
  const itemIsObject = Boolean(item) && typeof item === "object";
  if (!message && type === JSONL_TYPE_ITEM_COMPLETED && itemIsObject) {
    // Keep the previous (falsy) value when the item yields no text either.
    message = flattenContent(item.text ?? item.content ?? item.output) || message;
  }
  if (message) {
    return type ? `${type}: ${message}` : message;
  }
  if (!type) {
    return type;
  }
  // No message: fall back to the event type, qualified by the item type if present.
  const itemType = typeof item?.type === "string" ? item.type : void 0;
  return itemType ? `${type}:${itemType}` : type;
}
|
|
1525
|
+
/**
 * Parses `rawLine` as JSON without throwing.
 * @param {string} rawLine - Candidate JSON text.
 * @returns {unknown} The parsed value, or `undefined` when parsing fails.
 */
function tryParseJsonValue(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
|
|
1226
1532
|
async function locateExecutable(candidate) {
|
|
1227
1533
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
1228
1534
|
if (includesPathSeparator) {
|
|
@@ -1492,10 +1798,12 @@ async function defaultCodexRunner(options) {
|
|
|
1492
1798
|
child.stdout.setEncoding("utf8");
|
|
1493
1799
|
child.stdout.on("data", (chunk) => {
|
|
1494
1800
|
stdout += chunk;
|
|
1801
|
+
options.onStdoutChunk?.(chunk);
|
|
1495
1802
|
});
|
|
1496
1803
|
child.stderr.setEncoding("utf8");
|
|
1497
1804
|
child.stderr.on("data", (chunk) => {
|
|
1498
1805
|
stderr += chunk;
|
|
1806
|
+
options.onStderrChunk?.(chunk);
|
|
1499
1807
|
});
|
|
1500
1808
|
child.stdin.end(options.prompt);
|
|
1501
1809
|
const cleanup = () => {
|
|
@@ -1740,6 +2048,8 @@ function resolveCodexConfig(target, env) {
|
|
|
1740
2048
|
const argsSource = settings.args ?? settings.arguments;
|
|
1741
2049
|
const cwdSource = settings.cwd;
|
|
1742
2050
|
const timeoutSource = settings.timeout_seconds ?? settings.timeoutSeconds;
|
|
2051
|
+
const logDirSource = settings.log_dir ?? settings.logDir ?? settings.log_directory ?? settings.logDirectory;
|
|
2052
|
+
const logFormatSource = settings.log_format ?? settings.logFormat ?? settings.log_output_format ?? settings.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
|
|
1743
2053
|
const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
|
|
1744
2054
|
allowLiteral: true,
|
|
1745
2055
|
optionalEnv: true
|
|
@@ -1750,13 +2060,33 @@ function resolveCodexConfig(target, env) {
|
|
|
1750
2060
|
optionalEnv: true
|
|
1751
2061
|
});
|
|
1752
2062
|
const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} codex timeout`);
|
|
2063
|
+
const logDir = resolveOptionalString(logDirSource, env, `${target.name} codex log directory`, {
|
|
2064
|
+
allowLiteral: true,
|
|
2065
|
+
optionalEnv: true
|
|
2066
|
+
});
|
|
2067
|
+
const logFormat = normalizeCodexLogFormat(logFormatSource);
|
|
1753
2068
|
return {
|
|
1754
2069
|
executable,
|
|
1755
2070
|
args,
|
|
1756
2071
|
cwd,
|
|
1757
|
-
timeoutMs
|
|
2072
|
+
timeoutMs,
|
|
2073
|
+
logDir,
|
|
2074
|
+
logFormat
|
|
1758
2075
|
};
|
|
1759
2076
|
}
|
|
2077
|
+
/**
 * Validates and normalizes the configured Codex log format.
 * @param {unknown} value - Raw setting or environment value.
 * @returns {"summary" | "json" | undefined} The lower-cased format, or
 *   `undefined` when no format was supplied (`undefined`, `null`, or a
 *   blank string) so the caller can apply its default.
 * @throws {Error} When `value` is not a string or names an unknown format.
 */
function normalizeCodexLogFormat(value) {
  if (value === void 0 || value === null) {
    return void 0;
  }
  if (typeof value !== "string") {
    throw new Error("codex log format must be 'summary' or 'json'");
  }
  const normalized = value.trim().toLowerCase();
  // A blank value (e.g. an environment variable exported as empty) means
  // "not configured", not "invalid".
  if (normalized.length === 0) {
    return void 0;
  }
  if (normalized === "json" || normalized === "summary") {
    return normalized;
  }
  throw new Error("codex log format must be 'summary' or 'json'");
}
|
|
1760
2090
|
function resolveMockConfig(target) {
|
|
1761
2091
|
const settings = target.settings ?? {};
|
|
1762
2092
|
const response = typeof settings.response === "string" ? settings.response : void 0;
|
|
@@ -2386,7 +2716,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
2386
2716
|
}
|
|
2387
2717
|
|
|
2388
2718
|
// src/evaluation/evaluators.ts
|
|
2389
|
-
import {
|
|
2719
|
+
import { ax, f } from "@ax-llm/ax";
|
|
2720
|
+
import { randomUUID as randomUUID2 } from "node:crypto";
|
|
2721
|
+
// Structured-output signature for the LLM judge: one `evaluationContext`
// object in, one `evaluation` object (score, hits, misses, reasoning) out.
var LLM_JUDGE_SIGNATURE = (() => {
  const builder = f();
  const withInput = builder.input(
    "evaluationContext",
    f.object(
      {
        expectedOutcome: f.string("The expected outcome for the original task"),
        request: f.string("The original task request"),
        referenceAnswer: f.string("The gold standard reference answer"),
        generatedAnswer: f.string("The answer to evaluate"),
        guidelines: f.string("Additional evaluation guidelines or instructions").optional()
      },
      "Complete evaluation context for the judge"
    )
  );
  const withOutput = withInput.output(
    "evaluation",
    f.object({
      score: f.number("Score between 0.0 and 1.0").min(0).max(1),
      hits: f.string("Brief specific achievement").array(),
      misses: f.string("Brief specific failure or omission").array(),
      reasoning: f.string("Concise explanation for the score").max(500)
    })
  );
  return withOutput.build();
})();
// Generator created from the signature; its `forward` method is what the judge evaluator calls.
var LLM_JUDGE = ax(LLM_JUDGE_SIGNATURE);
|
|
2390
2743
|
var LlmJudgeEvaluator = class {
|
|
2391
2744
|
kind = "llm_judge";
|
|
2392
2745
|
resolveJudgeProvider;
|
|
@@ -2404,6 +2757,44 @@ var LlmJudgeEvaluator = class {
|
|
|
2404
2757
|
if (!judgeProvider) {
|
|
2405
2758
|
throw new Error("No judge provider available for LLM grading");
|
|
2406
2759
|
}
|
|
2760
|
+
if (providerSupportsAx(judgeProvider)) {
|
|
2761
|
+
return this.evaluateWithAx(context, judgeProvider);
|
|
2762
|
+
}
|
|
2763
|
+
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2764
|
+
}
|
|
2765
|
+
async evaluateWithAx(context, judgeProvider) {
|
|
2766
|
+
const ai = judgeProvider.getAxAI();
|
|
2767
|
+
const guidelines = context.promptInputs.guidelines?.trim();
|
|
2768
|
+
const evaluationContext = {
|
|
2769
|
+
expectedOutcome: context.evalCase.outcome.trim(),
|
|
2770
|
+
request: context.evalCase.task.trim(),
|
|
2771
|
+
referenceAnswer: context.evalCase.expected_assistant_raw.trim(),
|
|
2772
|
+
generatedAnswer: context.candidate.trim(),
|
|
2773
|
+
...guidelines ? { guidelines } : {}
|
|
2774
|
+
};
|
|
2775
|
+
const options = this.buildJudgeForwardOptions(context);
|
|
2776
|
+
const result = await LLM_JUDGE.forward(ai, { evaluationContext }, options);
|
|
2777
|
+
const evaluation = result.evaluation;
|
|
2778
|
+
const expectedAspectCount = Math.max(
|
|
2779
|
+
evaluation.hits.length + evaluation.misses.length,
|
|
2780
|
+
1
|
|
2781
|
+
);
|
|
2782
|
+
return {
|
|
2783
|
+
score: evaluation.score,
|
|
2784
|
+
hits: evaluation.hits,
|
|
2785
|
+
misses: evaluation.misses,
|
|
2786
|
+
expectedAspectCount,
|
|
2787
|
+
reasoning: evaluation.reasoning,
|
|
2788
|
+
evaluatorRawRequest: {
|
|
2789
|
+
id: randomUUID2(),
|
|
2790
|
+
provider: judgeProvider.id,
|
|
2791
|
+
target: context.target.name,
|
|
2792
|
+
method: "ax-structured-output",
|
|
2793
|
+
signature: LLM_JUDGE_SIGNATURE.toString()
|
|
2794
|
+
}
|
|
2795
|
+
};
|
|
2796
|
+
}
|
|
2797
|
+
async evaluateWithPrompt(context, judgeProvider) {
|
|
2407
2798
|
const prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
2408
2799
|
const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
|
|
2409
2800
|
const metadata = {
|
|
@@ -2423,8 +2814,9 @@ var LlmJudgeEvaluator = class {
|
|
|
2423
2814
|
const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
2424
2815
|
const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
2425
2816
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
2817
|
+
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
2426
2818
|
const evaluatorRawRequest = {
|
|
2427
|
-
id:
|
|
2819
|
+
id: randomUUID2(),
|
|
2428
2820
|
provider: judgeProvider.id,
|
|
2429
2821
|
prompt,
|
|
2430
2822
|
target: context.target.name,
|
|
@@ -2435,12 +2827,34 @@ var LlmJudgeEvaluator = class {
|
|
|
2435
2827
|
score,
|
|
2436
2828
|
hits,
|
|
2437
2829
|
misses,
|
|
2438
|
-
expectedAspectCount
|
|
2830
|
+
expectedAspectCount,
|
|
2439
2831
|
reasoning,
|
|
2440
2832
|
evaluatorRawRequest
|
|
2441
2833
|
};
|
|
2442
2834
|
}
|
|
2835
|
+
buildJudgeForwardOptions(context) {
|
|
2836
|
+
const modelConfig = this.buildJudgeModelConfig();
|
|
2837
|
+
if (modelConfig === void 0 && context.judgeModel === void 0) {
|
|
2838
|
+
return void 0;
|
|
2839
|
+
}
|
|
2840
|
+
return {
|
|
2841
|
+
...context.judgeModel ? { model: context.judgeModel } : {},
|
|
2842
|
+
...modelConfig ? { modelConfig } : {}
|
|
2843
|
+
};
|
|
2844
|
+
}
|
|
2845
|
+
buildJudgeModelConfig() {
|
|
2846
|
+
if (this.maxOutputTokens === void 0 && this.temperature === void 0) {
|
|
2847
|
+
return void 0;
|
|
2848
|
+
}
|
|
2849
|
+
return {
|
|
2850
|
+
...this.maxOutputTokens !== void 0 ? { maxTokens: this.maxOutputTokens } : {},
|
|
2851
|
+
...this.temperature !== void 0 ? { temperature: this.temperature } : {}
|
|
2852
|
+
};
|
|
2853
|
+
}
|
|
2443
2854
|
};
|
|
2855
|
+
/**
 * Checks whether a provider exposes a callable `getAxAI` member.
 * @param {object} provider - Provider instance to inspect.
 * @returns {boolean} `true` when `provider.getAxAI` is a function.
 */
function providerSupportsAx(provider) {
  const { getAxAI } = provider;
  return typeof getAxAI === "function";
}
|
|
2444
2858
|
var QUALITY_SYSTEM_PROMPT = [
|
|
2445
2859
|
"You are an expert evaluator. Your goal is to grade the generated_answer based on how well it achieves the expected_outcome for the original task.",
|
|
2446
2860
|
"",
|
|
@@ -2663,8 +3077,8 @@ function parseJsonSafe(payload) {
|
|
|
2663
3077
|
}
|
|
2664
3078
|
|
|
2665
3079
|
// src/evaluation/orchestrator.ts
|
|
2666
|
-
import { createHash, randomUUID as
|
|
2667
|
-
import { mkdir as mkdir2,
|
|
3080
|
+
import { createHash, randomUUID as randomUUID3 } from "node:crypto";
|
|
3081
|
+
import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
|
|
2668
3082
|
import path7 from "node:path";
|
|
2669
3083
|
|
|
2670
3084
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
@@ -3211,6 +3625,7 @@ async function evaluateCandidate(options) {
|
|
|
3211
3625
|
};
|
|
3212
3626
|
return {
|
|
3213
3627
|
eval_id: evalCase.id,
|
|
3628
|
+
dataset: evalCase.dataset,
|
|
3214
3629
|
conversation_id: evalCase.conversation_id,
|
|
3215
3630
|
score: score.score,
|
|
3216
3631
|
hits: score.hits,
|
|
@@ -3387,7 +3802,7 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
3387
3802
|
async function resolveCustomPrompt(config) {
|
|
3388
3803
|
if (config.promptPath) {
|
|
3389
3804
|
try {
|
|
3390
|
-
return await
|
|
3805
|
+
return await readTextFile(config.promptPath);
|
|
3391
3806
|
} catch (error) {
|
|
3392
3807
|
const message = error instanceof Error ? error.message : String(error);
|
|
3393
3808
|
console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
|
|
@@ -3436,7 +3851,7 @@ function sanitizeFilename(value) {
|
|
|
3436
3851
|
return "prompt";
|
|
3437
3852
|
}
|
|
3438
3853
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
3439
|
-
return sanitized.length > 0 ? sanitized :
|
|
3854
|
+
return sanitized.length > 0 ? sanitized : randomUUID3();
|
|
3440
3855
|
}
|
|
3441
3856
|
async function invokeProvider(provider, options) {
|
|
3442
3857
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
@@ -3475,6 +3890,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
|
|
|
3475
3890
|
};
|
|
3476
3891
|
return {
|
|
3477
3892
|
eval_id: evalCase.id,
|
|
3893
|
+
dataset: evalCase.dataset,
|
|
3478
3894
|
conversation_id: evalCase.conversation_id,
|
|
3479
3895
|
score: 0,
|
|
3480
3896
|
hits: [],
|
|
@@ -3524,6 +3940,7 @@ export {
|
|
|
3524
3940
|
buildDirectoryChain,
|
|
3525
3941
|
buildPromptInputs,
|
|
3526
3942
|
buildSearchRoots,
|
|
3943
|
+
consumeCodexLogEntries,
|
|
3527
3944
|
createAgentKernel,
|
|
3528
3945
|
createProvider,
|
|
3529
3946
|
ensureVSCodeSubagents,
|
|
@@ -3540,10 +3957,12 @@ export {
|
|
|
3540
3957
|
listTargetNames,
|
|
3541
3958
|
loadEvalCases,
|
|
3542
3959
|
readTargetDefinitions,
|
|
3960
|
+
readTextFile,
|
|
3543
3961
|
resolveAndCreateProvider,
|
|
3544
3962
|
resolveFileReference,
|
|
3545
3963
|
resolveTargetDefinition,
|
|
3546
3964
|
runEvalCase,
|
|
3547
|
-
runEvaluation
|
|
3965
|
+
runEvaluation,
|
|
3966
|
+
subscribeToCodexLogEntries
|
|
3548
3967
|
};
|
|
3549
3968
|
//# sourceMappingURL=index.js.map
|