@agentv/core 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-OW3SHBIJ.js → chunk-L7I5UTJU.js} +1 -1
- package/dist/{chunk-OW3SHBIJ.js.map → chunk-L7I5UTJU.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +221 -242
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -13
- package/dist/index.d.ts +11 -13
- package/dist/index.js +222 -243
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -299,6 +299,87 @@ function extractCodeBlocks(segments) {
|
|
|
299
299
|
}
|
|
300
300
|
return codeBlocks;
|
|
301
301
|
}
|
|
302
|
+
/**
 * Converts a list of chat messages into a flat array of content segments.
 *
 * String content becomes a `{ type: "text", value }` segment. Array content is
 * walked item by item: `file` items are resolved through `resolveFileReference`
 * and read from disk, every other JSON-object item is cloned through as-is.
 *
 * @param {object} options
 * @param {Array<object>} options.messages - Messages whose `content` is a string or a list of segment objects.
 * @param {*} options.searchRoots - Passed straight to `resolveFileReference` when resolving `file` segments.
 * @param {string} options.repoRootPath - Base used to compute the repo-relative path tested against guideline patterns.
 * @param {*} [options.guidelinePatterns] - Patterns handed to `isGuidelineFile`; only consulted for input messages.
 * @param {string[]} [options.guidelinePaths] - Out-parameter: absolute paths of files classified as guidelines are appended here.
 * @param {string[]} [options.textParts] - Out-parameter: raw text of string contents and string `value` fields is appended here.
 * @param {string} options.messageType - `"input"` enables guideline detection; any other value is reported as expected output in logs.
 * @param {boolean} [options.verbose] - When truthy, logs each resolved file to the console.
 * @returns {Promise<Array<object>>} The collected segments, in message order.
 */
async function processMessages(options) {
  const {
    messages,
    searchRoots,
    repoRootPath,
    guidelinePatterns,
    guidelinePaths,
    textParts,
    messageType,
    verbose
  } = options;
  const segments = [];
  for (const message of messages) {
    const content = message.content;
    if (typeof content === "string") {
      segments.push({ type: "text", value: content });
      if (textParts) {
        textParts.push(content);
      }
      continue;
    }
    // A message without iterable content (missing, null, number, plain object)
    // used to crash the for...of below. Skip it instead, the same way malformed
    // segments inside a content list are skipped.
    if (content == null || typeof content[Symbol.iterator] !== "function") {
      continue;
    }
    for (const rawSegment of content) {
      if (!isJsonObject(rawSegment)) {
        continue;
      }
      const segmentType = asString(rawSegment.type);
      if (segmentType === "file") {
        const rawValue = asString(rawSegment.value);
        if (!rawValue) {
          continue;
        }
        const { displayPath, resolvedPath, attempted } = await resolveFileReference(
          rawValue,
          searchRoots
        );
        if (!resolvedPath) {
          const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
          const context = messageType === "input" ? "" : " in expected_messages";
          logWarning(`File not found${context}: ${displayPath}`, attempts);
          continue;
        }
        try {
          // Normalise Windows line endings so downstream text is LF-only.
          const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
          if (messageType === "input" && guidelinePatterns && guidelinePaths) {
            const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
            if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
              // Guideline files are recorded by path only; they do not become segments.
              guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
              if (verbose) {
                console.log(` [Guideline] Found: ${displayPath}`);
                console.log(` Resolved to: ${resolvedPath}`);
              }
              continue;
            }
          }
          segments.push({
            type: "file",
            path: displayPath,
            text: fileContent,
            resolvedPath: import_node_path2.default.resolve(resolvedPath)
          });
          if (verbose) {
            const label = messageType === "input" ? "[File]" : "[Expected Output File]";
            console.log(` ${label} Found: ${displayPath}`);
            console.log(` Resolved to: ${resolvedPath}`);
          }
        } catch (error) {
          const context = messageType === "input" ? "" : " expected output";
          // Rejections are not guaranteed to be Error instances; avoid logging
          // "undefined" (or throwing on null) when they are not.
          const reason = error instanceof Error ? error.message : String(error);
          logWarning(`Could not read${context} file ${resolvedPath}: ${reason}`);
        }
        continue;
      }
      const clonedSegment = cloneJsonObject(rawSegment);
      segments.push(clonedSegment);
      const inlineValue = clonedSegment.value;
      if (typeof inlineValue === "string" && textParts) {
        textParts.push(inlineValue);
      }
    }
  }
  return segments;
}
|
|
302
383
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
303
384
|
const verbose = options?.verbose ?? false;
|
|
304
385
|
const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
|
|
@@ -384,77 +465,34 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
384
465
|
}
|
|
385
466
|
}
|
|
386
467
|
}
|
|
387
|
-
const userSegments = [];
|
|
388
468
|
const guidelinePaths = [];
|
|
389
|
-
const
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
searchRoots
|
|
410
|
-
);
|
|
411
|
-
if (!resolvedPath) {
|
|
412
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
413
|
-
logWarning(`File not found: ${displayPath}`, attempts);
|
|
414
|
-
continue;
|
|
415
|
-
}
|
|
416
|
-
try {
|
|
417
|
-
const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
418
|
-
const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
|
|
419
|
-
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
420
|
-
guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
|
|
421
|
-
if (verbose) {
|
|
422
|
-
console.log(` [Guideline] Found: ${displayPath}`);
|
|
423
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
424
|
-
}
|
|
425
|
-
} else {
|
|
426
|
-
userSegments.push({
|
|
427
|
-
type: "file",
|
|
428
|
-
path: displayPath,
|
|
429
|
-
text: fileContent,
|
|
430
|
-
resolvedPath: import_node_path2.default.resolve(resolvedPath)
|
|
431
|
-
});
|
|
432
|
-
if (verbose) {
|
|
433
|
-
console.log(` [File] Found: ${displayPath}`);
|
|
434
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
435
|
-
}
|
|
436
|
-
}
|
|
437
|
-
} catch (error) {
|
|
438
|
-
logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
439
|
-
}
|
|
440
|
-
continue;
|
|
441
|
-
}
|
|
442
|
-
const clonedSegment = cloneJsonObject(rawSegment);
|
|
443
|
-
userSegments.push(clonedSegment);
|
|
444
|
-
const inlineValue = clonedSegment.value;
|
|
445
|
-
if (typeof inlineValue === "string") {
|
|
446
|
-
userTextParts.push(inlineValue);
|
|
447
|
-
}
|
|
448
|
-
}
|
|
449
|
-
}
|
|
450
|
-
const codeSnippets = extractCodeBlocks(userSegments);
|
|
469
|
+
const inputTextParts = [];
|
|
470
|
+
const inputSegments = await processMessages({
|
|
471
|
+
messages: userMessages,
|
|
472
|
+
searchRoots,
|
|
473
|
+
repoRootPath,
|
|
474
|
+
guidelinePatterns,
|
|
475
|
+
guidelinePaths,
|
|
476
|
+
textParts: inputTextParts,
|
|
477
|
+
messageType: "input",
|
|
478
|
+
verbose
|
|
479
|
+
});
|
|
480
|
+
const outputSegments = await processMessages({
|
|
481
|
+
messages: assistantMessages,
|
|
482
|
+
searchRoots,
|
|
483
|
+
repoRootPath,
|
|
484
|
+
guidelinePatterns,
|
|
485
|
+
messageType: "output",
|
|
486
|
+
verbose
|
|
487
|
+
});
|
|
488
|
+
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
451
489
|
const assistantContent = assistantMessages[0]?.content;
|
|
452
|
-
const
|
|
453
|
-
const
|
|
490
|
+
const referenceAnswer = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
491
|
+
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
454
492
|
const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
455
493
|
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
456
494
|
const userFilePaths = [];
|
|
457
|
-
for (const segment of
|
|
495
|
+
for (const segment of inputSegments) {
|
|
458
496
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
459
497
|
userFilePaths.push(segment.resolvedPath);
|
|
460
498
|
}
|
|
@@ -467,15 +505,16 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
467
505
|
id,
|
|
468
506
|
dataset: datasetName,
|
|
469
507
|
conversation_id: conversationId,
|
|
470
|
-
|
|
471
|
-
|
|
508
|
+
question,
|
|
509
|
+
input_segments: inputSegments,
|
|
510
|
+
output_segments: outputSegments,
|
|
472
511
|
system_message: systemMessageContent,
|
|
473
|
-
|
|
512
|
+
reference_answer: referenceAnswer,
|
|
474
513
|
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
|
|
475
514
|
guideline_patterns: guidelinePatterns,
|
|
476
515
|
file_paths: allFilePaths,
|
|
477
516
|
code_snippets: codeSnippets,
|
|
478
|
-
outcome,
|
|
517
|
+
expected_outcome: outcome,
|
|
479
518
|
evaluator: testCaseEvaluatorKind,
|
|
480
519
|
evaluators
|
|
481
520
|
};
|
|
@@ -511,36 +550,36 @@ ${content}`);
|
|
|
511
550
|
logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
512
551
|
}
|
|
513
552
|
}
|
|
514
|
-
const
|
|
515
|
-
for (const segment of testCase.
|
|
553
|
+
const questionParts = [];
|
|
554
|
+
for (const segment of testCase.input_segments) {
|
|
516
555
|
const typeValue = segment.type;
|
|
517
556
|
if (typeof typeValue === "string" && typeValue === "file") {
|
|
518
557
|
const pathValue = segment.path;
|
|
519
558
|
const textValue = segment.text;
|
|
520
559
|
const label = typeof pathValue === "string" ? pathValue : "file";
|
|
521
560
|
const body = typeof textValue === "string" ? textValue : "";
|
|
522
|
-
|
|
561
|
+
questionParts.push(`=== ${label} ===
|
|
523
562
|
${body}`);
|
|
524
563
|
continue;
|
|
525
564
|
}
|
|
526
565
|
if (typeof typeValue === "string" && typeValue === "text") {
|
|
527
566
|
const value = segment.value;
|
|
528
567
|
if (typeof value === "string") {
|
|
529
|
-
|
|
568
|
+
questionParts.push(value);
|
|
530
569
|
}
|
|
531
570
|
continue;
|
|
532
571
|
}
|
|
533
572
|
const genericValue = segment.value;
|
|
534
573
|
if (typeof genericValue === "string") {
|
|
535
|
-
|
|
574
|
+
questionParts.push(genericValue);
|
|
536
575
|
}
|
|
537
576
|
}
|
|
538
577
|
if (testCase.code_snippets.length > 0) {
|
|
539
|
-
|
|
578
|
+
questionParts.push(testCase.code_snippets.join("\n"));
|
|
540
579
|
}
|
|
541
|
-
const
|
|
580
|
+
const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
542
581
|
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
543
|
-
return {
|
|
582
|
+
return { question, guidelines, systemMessage: testCase.system_message };
|
|
544
583
|
}
|
|
545
584
|
async function fileExists2(absolutePath) {
|
|
546
585
|
try {
|
|
@@ -752,7 +791,7 @@ function buildChatPrompt(request) {
|
|
|
752
791
|
${request.guidelines.trim()}`);
|
|
753
792
|
}
|
|
754
793
|
const systemContent = systemSegments.join("\n\n");
|
|
755
|
-
const userContent = request.
|
|
794
|
+
const userContent = request.question.trim();
|
|
756
795
|
const prompt = [
|
|
757
796
|
{
|
|
758
797
|
role: "system",
|
|
@@ -1050,7 +1089,7 @@ var CliProvider = class {
|
|
|
1050
1089
|
healthcheck.commandTemplate,
|
|
1051
1090
|
buildTemplateValues(
|
|
1052
1091
|
{
|
|
1053
|
-
|
|
1092
|
+
question: "",
|
|
1054
1093
|
guidelines: "",
|
|
1055
1094
|
inputFiles: [],
|
|
1056
1095
|
evalCaseId: "",
|
|
@@ -1077,7 +1116,7 @@ var CliProvider = class {
|
|
|
1077
1116
|
function buildTemplateValues(request, config) {
|
|
1078
1117
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
1079
1118
|
return {
|
|
1080
|
-
PROMPT: shellEscape(request.
|
|
1119
|
+
PROMPT: shellEscape(request.question ?? ""),
|
|
1081
1120
|
GUIDELINES: shellEscape(request.guidelines ?? ""),
|
|
1082
1121
|
EVAL_ID: shellEscape(request.evalCaseId ?? ""),
|
|
1083
1122
|
ATTEMPT: shellEscape(String(request.attempt ?? 0)),
|
|
@@ -1141,6 +1180,59 @@ var import_node_os = require("os");
|
|
|
1141
1180
|
var import_node_path5 = __toESM(require("path"), 1);
|
|
1142
1181
|
var import_node_util2 = require("util");
|
|
1143
1182
|
|
|
1183
|
+
// src/evaluation/providers/codex-log-tracker.ts
|
|
1184
|
+
// Keys from the global symbol registry under which the log buffer and the
// subscriber set are stored on globalThis.
var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");

/** Returns the log-entry array kept on globalThis, creating it on first use. */
function getCodexLogStore() {
  let entries = globalThis[GLOBAL_LOGS_KEY];
  if (!entries) {
    entries = [];
    globalThis[GLOBAL_LOGS_KEY] = entries;
  }
  return entries;
}

/** Returns the Set of subscriber callbacks kept on globalThis, creating it on first use. */
function getSubscriberStore() {
  let listeners = globalThis[GLOBAL_SUBSCRIBERS_KEY];
  if (!listeners) {
    listeners = /* @__PURE__ */ new Set();
    globalThis[GLOBAL_SUBSCRIBERS_KEY] = listeners;
  }
  return listeners;
}

/**
 * Calls every registered subscriber with `entry`.
 * Iterates over a copy of the subscriber set; a throwing subscriber is
 * reported with console.warn and does not stop the remaining ones.
 */
function notifySubscribers(entry) {
  [...getSubscriberStore()].forEach((notify) => {
    try {
      notify(entry);
    } catch (failure) {
      const reason = failure instanceof Error ? failure.message : String(failure);
      console.warn(`Codex log subscriber failed: ${reason}`);
    }
  });
}

/** Appends `entry` to the log store, then notifies subscribers. */
function recordCodexLogEntry(entry) {
  const entries = getCodexLogStore();
  entries.push(entry);
  notifySubscribers(entry);
}

/** Removes and returns all buffered log entries; returns a new empty array when there are none. */
function consumeCodexLogEntries() {
  const entries = getCodexLogStore();
  return entries.length === 0 ? [] : entries.splice(0, entries.length);
}

/**
 * Registers `listener` for future log entries.
 * @returns {() => void} Function that removes the listener again.
 */
function subscribeToCodexLogEntries(listener) {
  const listeners = getSubscriberStore();
  listeners.add(listener);
  const unsubscribe = () => {
    listeners.delete(listener);
  };
  return unsubscribe;
}
|
|
1235
|
+
|
|
1144
1236
|
// src/evaluation/providers/preread.ts
|
|
1145
1237
|
var import_node_path4 = __toESM(require("path"), 1);
|
|
1146
1238
|
function buildPromptDocument(request, inputFiles, options) {
|
|
@@ -1158,7 +1250,7 @@ function buildPromptDocument(request, inputFiles, options) {
|
|
|
1158
1250
|
if (prereadBlock.length > 0) {
|
|
1159
1251
|
parts.push("\n", prereadBlock);
|
|
1160
1252
|
}
|
|
1161
|
-
parts.push("\n[[ ## user_query ## ]]\n", request.
|
|
1253
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
1162
1254
|
return parts.join("\n").trim();
|
|
1163
1255
|
}
|
|
1164
1256
|
function normalizeInputFiles2(inputFiles) {
|
|
@@ -1242,59 +1334,6 @@ function pathToFileUri(filePath) {
|
|
|
1242
1334
|
return `file://${normalizedPath}`;
|
|
1243
1335
|
}
|
|
1244
1336
|
|
|
1245
|
-
// src/evaluation/providers/codex-log-tracker.ts
|
|
1246
|
-
var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
|
|
1247
|
-
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
|
|
1248
|
-
function getCodexLogStore() {
|
|
1249
|
-
const globalObject = globalThis;
|
|
1250
|
-
const existing = globalObject[GLOBAL_LOGS_KEY];
|
|
1251
|
-
if (existing) {
|
|
1252
|
-
return existing;
|
|
1253
|
-
}
|
|
1254
|
-
const created = [];
|
|
1255
|
-
globalObject[GLOBAL_LOGS_KEY] = created;
|
|
1256
|
-
return created;
|
|
1257
|
-
}
|
|
1258
|
-
function getSubscriberStore() {
|
|
1259
|
-
const globalObject = globalThis;
|
|
1260
|
-
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
|
|
1261
|
-
if (existing) {
|
|
1262
|
-
return existing;
|
|
1263
|
-
}
|
|
1264
|
-
const created = /* @__PURE__ */ new Set();
|
|
1265
|
-
globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
|
|
1266
|
-
return created;
|
|
1267
|
-
}
|
|
1268
|
-
function notifySubscribers(entry) {
|
|
1269
|
-
const subscribers = Array.from(getSubscriberStore());
|
|
1270
|
-
for (const listener of subscribers) {
|
|
1271
|
-
try {
|
|
1272
|
-
listener(entry);
|
|
1273
|
-
} catch (error) {
|
|
1274
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
1275
|
-
console.warn(`Codex log subscriber failed: ${message}`);
|
|
1276
|
-
}
|
|
1277
|
-
}
|
|
1278
|
-
}
|
|
1279
|
-
function recordCodexLogEntry(entry) {
|
|
1280
|
-
getCodexLogStore().push(entry);
|
|
1281
|
-
notifySubscribers(entry);
|
|
1282
|
-
}
|
|
1283
|
-
function consumeCodexLogEntries() {
|
|
1284
|
-
const store = getCodexLogStore();
|
|
1285
|
-
if (store.length === 0) {
|
|
1286
|
-
return [];
|
|
1287
|
-
}
|
|
1288
|
-
return store.splice(0, store.length);
|
|
1289
|
-
}
|
|
1290
|
-
function subscribeToCodexLogEntries(listener) {
|
|
1291
|
-
const store = getSubscriberStore();
|
|
1292
|
-
store.add(listener);
|
|
1293
|
-
return () => {
|
|
1294
|
-
store.delete(listener);
|
|
1295
|
-
};
|
|
1296
|
-
}
|
|
1297
|
-
|
|
1298
1337
|
// src/evaluation/providers/codex.ts
|
|
1299
1338
|
var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
|
|
1300
1339
|
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
@@ -2028,7 +2067,7 @@ var MockProvider = class {
|
|
|
2028
2067
|
return {
|
|
2029
2068
|
text: this.cannedResponse,
|
|
2030
2069
|
raw: {
|
|
2031
|
-
|
|
2070
|
+
question: request.question,
|
|
2032
2071
|
guidelines: request.guidelines
|
|
2033
2072
|
}
|
|
2034
2073
|
};
|
|
@@ -2644,7 +2683,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
|
2644
2683
|
if (prereadBlock.length > 0) {
|
|
2645
2684
|
parts.push("\n", prereadBlock);
|
|
2646
2685
|
}
|
|
2647
|
-
parts.push("\n[[ ## user_query ## ]]\n", request.
|
|
2686
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
2648
2687
|
return parts.join("\n").trim();
|
|
2649
2688
|
}
|
|
2650
2689
|
function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
@@ -2886,30 +2925,7 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
2886
2925
|
}
|
|
2887
2926
|
|
|
2888
2927
|
// src/evaluation/evaluators.ts
|
|
2889
|
-
var import_ax3 = require("@ax-llm/ax");
|
|
2890
2928
|
var import_node_crypto2 = require("crypto");
|
|
2891
|
-
var LLM_JUDGE_SIGNATURE = (0, import_ax3.f)().input(
|
|
2892
|
-
"evaluationContext",
|
|
2893
|
-
import_ax3.f.object(
|
|
2894
|
-
{
|
|
2895
|
-
expectedOutcome: import_ax3.f.string("The expected outcome for the original task"),
|
|
2896
|
-
request: import_ax3.f.string("The original task request"),
|
|
2897
|
-
referenceAnswer: import_ax3.f.string("The gold standard reference answer"),
|
|
2898
|
-
generatedAnswer: import_ax3.f.string("The answer to evaluate"),
|
|
2899
|
-
guidelines: import_ax3.f.string("Additional evaluation guidelines or instructions").optional()
|
|
2900
|
-
},
|
|
2901
|
-
"Complete evaluation context for the judge"
|
|
2902
|
-
)
|
|
2903
|
-
).output(
|
|
2904
|
-
"evaluation",
|
|
2905
|
-
import_ax3.f.object({
|
|
2906
|
-
score: import_ax3.f.number("Score between 0.0 and 1.0").min(0).max(1),
|
|
2907
|
-
hits: import_ax3.f.string("Brief specific achievement").array(),
|
|
2908
|
-
misses: import_ax3.f.string("Brief specific failure or omission").array(),
|
|
2909
|
-
reasoning: import_ax3.f.string("Concise explanation for the score").max(500)
|
|
2910
|
-
})
|
|
2911
|
-
).build();
|
|
2912
|
-
var LLM_JUDGE = (0, import_ax3.ax)(LLM_JUDGE_SIGNATURE);
|
|
2913
2929
|
var LlmJudgeEvaluator = class {
|
|
2914
2930
|
kind = "llm_judge";
|
|
2915
2931
|
resolveJudgeProvider;
|
|
@@ -2927,52 +2943,29 @@ var LlmJudgeEvaluator = class {
|
|
|
2927
2943
|
if (!judgeProvider) {
|
|
2928
2944
|
throw new Error("No judge provider available for LLM grading");
|
|
2929
2945
|
}
|
|
2930
|
-
if (providerSupportsAx(judgeProvider)) {
|
|
2931
|
-
return this.evaluateWithAx(context, judgeProvider);
|
|
2932
|
-
}
|
|
2933
2946
|
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2934
2947
|
}
|
|
2935
|
-
async evaluateWithAx(context, judgeProvider) {
|
|
2936
|
-
const ai = judgeProvider.getAxAI();
|
|
2937
|
-
const guidelines = context.promptInputs.guidelines?.trim();
|
|
2938
|
-
const evaluationContext = {
|
|
2939
|
-
expectedOutcome: context.evalCase.outcome.trim(),
|
|
2940
|
-
request: context.evalCase.task.trim(),
|
|
2941
|
-
referenceAnswer: context.evalCase.expected_assistant_raw.trim(),
|
|
2942
|
-
generatedAnswer: context.candidate.trim(),
|
|
2943
|
-
...guidelines ? { guidelines } : {}
|
|
2944
|
-
};
|
|
2945
|
-
const options = this.buildJudgeForwardOptions(context);
|
|
2946
|
-
const result = await LLM_JUDGE.forward(ai, { evaluationContext }, options);
|
|
2947
|
-
const evaluation = result.evaluation;
|
|
2948
|
-
const expectedAspectCount = Math.max(
|
|
2949
|
-
evaluation.hits.length + evaluation.misses.length,
|
|
2950
|
-
1
|
|
2951
|
-
);
|
|
2952
|
-
return {
|
|
2953
|
-
score: evaluation.score,
|
|
2954
|
-
hits: evaluation.hits,
|
|
2955
|
-
misses: evaluation.misses,
|
|
2956
|
-
expectedAspectCount,
|
|
2957
|
-
reasoning: evaluation.reasoning,
|
|
2958
|
-
evaluatorRawRequest: {
|
|
2959
|
-
id: (0, import_node_crypto2.randomUUID)(),
|
|
2960
|
-
provider: judgeProvider.id,
|
|
2961
|
-
target: context.target.name,
|
|
2962
|
-
method: "ax-structured-output",
|
|
2963
|
-
signature: LLM_JUDGE_SIGNATURE.toString()
|
|
2964
|
-
}
|
|
2965
|
-
};
|
|
2966
|
-
}
|
|
2967
2948
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
2968
|
-
|
|
2969
|
-
|
|
2949
|
+
let prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
2950
|
+
let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
|
|
2951
|
+
if (systemPrompt && hasTemplateVariables(systemPrompt)) {
|
|
2952
|
+
const variables = {
|
|
2953
|
+
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
2954
|
+
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
2955
|
+
candidate_answer: context.candidate,
|
|
2956
|
+
reference_answer: context.evalCase.reference_answer,
|
|
2957
|
+
expected_outcome: context.evalCase.expected_outcome,
|
|
2958
|
+
question: context.evalCase.question
|
|
2959
|
+
};
|
|
2960
|
+
prompt = substituteVariables(systemPrompt, variables);
|
|
2961
|
+
systemPrompt = QUALITY_SYSTEM_PROMPT;
|
|
2962
|
+
}
|
|
2970
2963
|
const metadata = {
|
|
2971
2964
|
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2972
2965
|
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
2973
2966
|
};
|
|
2974
2967
|
const response = await judgeProvider.invoke({
|
|
2975
|
-
prompt,
|
|
2968
|
+
question: prompt,
|
|
2976
2969
|
metadata,
|
|
2977
2970
|
evalCaseId: context.evalCase.id,
|
|
2978
2971
|
attempt: context.attempt,
|
|
@@ -3002,33 +2995,11 @@ var LlmJudgeEvaluator = class {
|
|
|
3002
2995
|
evaluatorRawRequest
|
|
3003
2996
|
};
|
|
3004
2997
|
}
|
|
3005
|
-
buildJudgeForwardOptions(context) {
|
|
3006
|
-
const modelConfig = this.buildJudgeModelConfig();
|
|
3007
|
-
if (modelConfig === void 0 && context.judgeModel === void 0) {
|
|
3008
|
-
return void 0;
|
|
3009
|
-
}
|
|
3010
|
-
return {
|
|
3011
|
-
...context.judgeModel ? { model: context.judgeModel } : {},
|
|
3012
|
-
...modelConfig ? { modelConfig } : {}
|
|
3013
|
-
};
|
|
3014
|
-
}
|
|
3015
|
-
buildJudgeModelConfig() {
|
|
3016
|
-
if (this.maxOutputTokens === void 0 && this.temperature === void 0) {
|
|
3017
|
-
return void 0;
|
|
3018
|
-
}
|
|
3019
|
-
return {
|
|
3020
|
-
...this.maxOutputTokens !== void 0 ? { maxTokens: this.maxOutputTokens } : {},
|
|
3021
|
-
...this.temperature !== void 0 ? { temperature: this.temperature } : {}
|
|
3022
|
-
};
|
|
3023
|
-
}
|
|
3024
2998
|
};
|
|
3025
|
-
function providerSupportsAx(provider) {
|
|
3026
|
-
return typeof provider.getAxAI === "function";
|
|
3027
|
-
}
|
|
3028
2999
|
var QUALITY_SYSTEM_PROMPT = [
|
|
3029
|
-
"You are an expert evaluator. Your goal is to grade the
|
|
3000
|
+
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
3030
3001
|
"",
|
|
3031
|
-
"Use the reference_answer as a gold standard for a high-quality response. The
|
|
3002
|
+
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
|
|
3032
3003
|
"",
|
|
3033
3004
|
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
3034
3005
|
"",
|
|
@@ -3041,18 +3012,18 @@ var QUALITY_SYSTEM_PROMPT = [
|
|
|
3041
3012
|
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
3042
3013
|
"}"
|
|
3043
3014
|
].join("\n");
|
|
3044
|
-
function buildQualityPrompt(
|
|
3015
|
+
function buildQualityPrompt(evalCase, candidate) {
|
|
3045
3016
|
const parts = [
|
|
3046
3017
|
"[[ ## expected_outcome ## ]]",
|
|
3047
|
-
|
|
3018
|
+
evalCase.expected_outcome.trim(),
|
|
3048
3019
|
"",
|
|
3049
|
-
"[[ ##
|
|
3050
|
-
|
|
3020
|
+
"[[ ## question ## ]]",
|
|
3021
|
+
evalCase.question.trim(),
|
|
3051
3022
|
"",
|
|
3052
3023
|
"[[ ## reference_answer ## ]]",
|
|
3053
|
-
|
|
3024
|
+
evalCase.reference_answer.trim(),
|
|
3054
3025
|
"",
|
|
3055
|
-
"[[ ##
|
|
3026
|
+
"[[ ## candidate_answer ## ]]",
|
|
3056
3027
|
candidate.trim(),
|
|
3057
3028
|
"",
|
|
3058
3029
|
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
@@ -3152,14 +3123,14 @@ var CodeEvaluator = class {
|
|
|
3152
3123
|
async evaluate(context) {
|
|
3153
3124
|
const inputPayload = JSON.stringify(
|
|
3154
3125
|
{
|
|
3155
|
-
|
|
3156
|
-
|
|
3157
|
-
|
|
3158
|
-
|
|
3126
|
+
question: context.evalCase.question,
|
|
3127
|
+
expected_outcome: context.evalCase.expected_outcome,
|
|
3128
|
+
reference_answer: context.evalCase.reference_answer,
|
|
3129
|
+
candidate_answer: context.candidate,
|
|
3159
3130
|
system_message: context.promptInputs.systemMessage ?? "",
|
|
3160
3131
|
guideline_paths: context.evalCase.guideline_paths,
|
|
3161
|
-
|
|
3162
|
-
|
|
3132
|
+
input_files: context.evalCase.file_paths,
|
|
3133
|
+
input_segments: context.evalCase.input_segments
|
|
3163
3134
|
},
|
|
3164
3135
|
null,
|
|
3165
3136
|
2
|
|
@@ -3245,6 +3216,14 @@ function parseJsonSafe(payload) {
|
|
|
3245
3216
|
return void 0;
|
|
3246
3217
|
}
|
|
3247
3218
|
}
|
|
3219
|
+
/**
 * Reports whether `text` contains at least one `${name}` placeholder, where
 * `name` is one or more ASCII letters, digits or underscores.
 *
 * @param {string} text - Text to scan.
 * @returns {boolean} True when a placeholder is present.
 */
function hasTemplateVariables(text) {
  const placeholder = /\$\{[a-zA-Z0-9_]+\}/;
  const firstMatch = placeholder.exec(text);
  return firstMatch !== null;
}
|
|
3222
|
+
/**
 * Replaces `${name}` placeholders in `template` with values from `variables`.
 *
 * A placeholder is left untouched when `variables` has no own property of that
 * name, or when the value is null or undefined.
 *
 * @param {string} template - Text containing `${name}` placeholders (letters, digits, underscores).
 * @param {Record<string, unknown>} variables - Replacement values keyed by placeholder name.
 * @returns {string} The template with known placeholders substituted.
 */
function substituteVariables(template, variables) {
  return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
    // Only own properties count. A plain lookup would resolve names such as
    // "constructor" or "toString" through Object.prototype and splice a
    // stringified function into the prompt.
    if (!Object.prototype.hasOwnProperty.call(variables, varName)) {
      return match;
    }
    return variables[varName] ?? match;
  });
}
|
|
3248
3227
|
|
|
3249
3228
|
// src/evaluation/orchestrator.ts
|
|
3250
3229
|
var import_node_crypto3 = require("crypto");
|
|
@@ -3601,7 +3580,7 @@ async function runBatchEvaluation(options) {
|
|
|
3601
3580
|
const batchRequests = evalCases.map((evalCase, index) => {
|
|
3602
3581
|
const promptInputs = promptInputsList[index];
|
|
3603
3582
|
return {
|
|
3604
|
-
|
|
3583
|
+
question: promptInputs.question,
|
|
3605
3584
|
guidelines: promptInputs.guidelines,
|
|
3606
3585
|
guideline_patterns: evalCase.guideline_patterns,
|
|
3607
3586
|
inputFiles: evalCase.file_paths,
|
|
@@ -3788,7 +3767,7 @@ async function evaluateCandidate(options) {
|
|
|
3788
3767
|
});
|
|
3789
3768
|
const completedAt = nowFn();
|
|
3790
3769
|
const rawRequest = {
|
|
3791
|
-
|
|
3770
|
+
question: promptInputs.question,
|
|
3792
3771
|
guidelines: promptInputs.guidelines,
|
|
3793
3772
|
guideline_paths: evalCase.guideline_paths,
|
|
3794
3773
|
system_message: promptInputs.systemMessage ?? ""
|
|
@@ -3800,7 +3779,7 @@ async function evaluateCandidate(options) {
|
|
|
3800
3779
|
score: score.score,
|
|
3801
3780
|
hits: score.hits,
|
|
3802
3781
|
misses: score.misses,
|
|
3803
|
-
|
|
3782
|
+
candidate_answer: candidate,
|
|
3804
3783
|
expected_aspect_count: score.expectedAspectCount,
|
|
3805
3784
|
target: target.name,
|
|
3806
3785
|
timestamp: completedAt.toISOString(),
|
|
@@ -4010,7 +3989,7 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
|
4010
3989
|
await (0, import_promises6.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
|
|
4011
3990
|
const payload = {
|
|
4012
3991
|
eval_id: evalCase.id,
|
|
4013
|
-
|
|
3992
|
+
question: promptInputs.question,
|
|
4014
3993
|
guidelines: promptInputs.guidelines,
|
|
4015
3994
|
guideline_paths: evalCase.guideline_paths
|
|
4016
3995
|
};
|
|
@@ -4032,7 +4011,7 @@ async function invokeProvider(provider, options) {
|
|
|
4032
4011
|
}
|
|
4033
4012
|
try {
|
|
4034
4013
|
return await provider.invoke({
|
|
4035
|
-
|
|
4014
|
+
question: promptInputs.question,
|
|
4036
4015
|
guidelines: promptInputs.guidelines,
|
|
4037
4016
|
guideline_patterns: evalCase.guideline_patterns,
|
|
4038
4017
|
inputFiles: evalCase.file_paths,
|
|
@@ -4052,7 +4031,7 @@ async function invokeProvider(provider, options) {
|
|
|
4052
4031
|
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
|
|
4053
4032
|
const message = error instanceof Error ? error.message : String(error);
|
|
4054
4033
|
const rawRequest = {
|
|
4055
|
-
|
|
4034
|
+
question: promptInputs.question,
|
|
4056
4035
|
guidelines: promptInputs.guidelines,
|
|
4057
4036
|
guideline_paths: evalCase.guideline_paths,
|
|
4058
4037
|
system_message: promptInputs.systemMessage ?? "",
|
|
@@ -4065,7 +4044,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
|
|
|
4065
4044
|
score: 0,
|
|
4066
4045
|
hits: [],
|
|
4067
4046
|
misses: [`Error: ${message}`],
|
|
4068
|
-
|
|
4047
|
+
candidate_answer: `Error occurred: ${message}`,
|
|
4069
4048
|
expected_aspect_count: 0,
|
|
4070
4049
|
target: targetName,
|
|
4071
4050
|
timestamp: timestamp.toISOString(),
|
|
@@ -4078,7 +4057,7 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
|
4078
4057
|
hash.update(provider.id);
|
|
4079
4058
|
hash.update(target.name);
|
|
4080
4059
|
hash.update(evalCase.id);
|
|
4081
|
-
hash.update(promptInputs.
|
|
4060
|
+
hash.update(promptInputs.question);
|
|
4082
4061
|
hash.update(promptInputs.guidelines);
|
|
4083
4062
|
hash.update(promptInputs.systemMessage ?? "");
|
|
4084
4063
|
return hash.digest("hex");
|