@agentv/core 0.6.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-OW3SHBIJ.js → chunk-UQLHF3T7.js} +12 -3
- package/dist/chunk-UQLHF3T7.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +143 -2
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.d.cts +1 -1
- package/dist/evaluation/validation/index.d.ts +1 -1
- package/dist/evaluation/validation/index.js +143 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +277 -328
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -13
- package/dist/index.d.ts +11 -13
- package/dist/index.js +267 -325
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-OW3SHBIJ.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -299,6 +299,87 @@ function extractCodeBlocks(segments) {
|
|
|
299
299
|
}
|
|
300
300
|
return codeBlocks;
|
|
301
301
|
}
|
|
302
|
+
async function processMessages(options) {
|
|
303
|
+
const {
|
|
304
|
+
messages,
|
|
305
|
+
searchRoots,
|
|
306
|
+
repoRootPath,
|
|
307
|
+
guidelinePatterns,
|
|
308
|
+
guidelinePaths,
|
|
309
|
+
textParts,
|
|
310
|
+
messageType,
|
|
311
|
+
verbose
|
|
312
|
+
} = options;
|
|
313
|
+
const segments = [];
|
|
314
|
+
for (const message of messages) {
|
|
315
|
+
const content = message.content;
|
|
316
|
+
if (typeof content === "string") {
|
|
317
|
+
segments.push({ type: "text", value: content });
|
|
318
|
+
if (textParts) {
|
|
319
|
+
textParts.push(content);
|
|
320
|
+
}
|
|
321
|
+
continue;
|
|
322
|
+
}
|
|
323
|
+
for (const rawSegment of content) {
|
|
324
|
+
if (!isJsonObject(rawSegment)) {
|
|
325
|
+
continue;
|
|
326
|
+
}
|
|
327
|
+
const segmentType = asString(rawSegment.type);
|
|
328
|
+
if (segmentType === "file") {
|
|
329
|
+
const rawValue = asString(rawSegment.value);
|
|
330
|
+
if (!rawValue) {
|
|
331
|
+
continue;
|
|
332
|
+
}
|
|
333
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
334
|
+
rawValue,
|
|
335
|
+
searchRoots
|
|
336
|
+
);
|
|
337
|
+
if (!resolvedPath) {
|
|
338
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
339
|
+
const context = messageType === "input" ? "" : " in expected_messages";
|
|
340
|
+
logWarning(`File not found${context}: ${displayPath}`, attempts);
|
|
341
|
+
continue;
|
|
342
|
+
}
|
|
343
|
+
try {
|
|
344
|
+
const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
345
|
+
if (messageType === "input" && guidelinePatterns && guidelinePaths) {
|
|
346
|
+
const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
|
|
347
|
+
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
348
|
+
guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
|
|
349
|
+
if (verbose) {
|
|
350
|
+
console.log(` [Guideline] Found: ${displayPath}`);
|
|
351
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
352
|
+
}
|
|
353
|
+
continue;
|
|
354
|
+
}
|
|
355
|
+
}
|
|
356
|
+
segments.push({
|
|
357
|
+
type: "file",
|
|
358
|
+
path: displayPath,
|
|
359
|
+
text: fileContent,
|
|
360
|
+
resolvedPath: import_node_path2.default.resolve(resolvedPath)
|
|
361
|
+
});
|
|
362
|
+
if (verbose) {
|
|
363
|
+
const label = messageType === "input" ? "[File]" : "[Expected Output File]";
|
|
364
|
+
console.log(` ${label} Found: ${displayPath}`);
|
|
365
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
366
|
+
}
|
|
367
|
+
} catch (error) {
|
|
368
|
+
const context = messageType === "input" ? "" : " expected output";
|
|
369
|
+
logWarning(`Could not read${context} file ${resolvedPath}: ${error.message}`);
|
|
370
|
+
}
|
|
371
|
+
continue;
|
|
372
|
+
}
|
|
373
|
+
const clonedSegment = cloneJsonObject(rawSegment);
|
|
374
|
+
segments.push(clonedSegment);
|
|
375
|
+
const inlineValue = clonedSegment.value;
|
|
376
|
+
if (typeof inlineValue === "string" && textParts) {
|
|
377
|
+
textParts.push(inlineValue);
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
return segments;
|
|
382
|
+
}
|
|
302
383
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
303
384
|
const verbose = options?.verbose ?? false;
|
|
304
385
|
const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
|
|
@@ -384,77 +465,34 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
384
465
|
}
|
|
385
466
|
}
|
|
386
467
|
}
|
|
387
|
-
const userSegments = [];
|
|
388
468
|
const guidelinePaths = [];
|
|
389
|
-
const
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
searchRoots
|
|
410
|
-
);
|
|
411
|
-
if (!resolvedPath) {
|
|
412
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
413
|
-
logWarning(`File not found: ${displayPath}`, attempts);
|
|
414
|
-
continue;
|
|
415
|
-
}
|
|
416
|
-
try {
|
|
417
|
-
const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
418
|
-
const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
|
|
419
|
-
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
420
|
-
guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
|
|
421
|
-
if (verbose) {
|
|
422
|
-
console.log(` [Guideline] Found: ${displayPath}`);
|
|
423
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
424
|
-
}
|
|
425
|
-
} else {
|
|
426
|
-
userSegments.push({
|
|
427
|
-
type: "file",
|
|
428
|
-
path: displayPath,
|
|
429
|
-
text: fileContent,
|
|
430
|
-
resolvedPath: import_node_path2.default.resolve(resolvedPath)
|
|
431
|
-
});
|
|
432
|
-
if (verbose) {
|
|
433
|
-
console.log(` [File] Found: ${displayPath}`);
|
|
434
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
435
|
-
}
|
|
436
|
-
}
|
|
437
|
-
} catch (error) {
|
|
438
|
-
logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
439
|
-
}
|
|
440
|
-
continue;
|
|
441
|
-
}
|
|
442
|
-
const clonedSegment = cloneJsonObject(rawSegment);
|
|
443
|
-
userSegments.push(clonedSegment);
|
|
444
|
-
const inlineValue = clonedSegment.value;
|
|
445
|
-
if (typeof inlineValue === "string") {
|
|
446
|
-
userTextParts.push(inlineValue);
|
|
447
|
-
}
|
|
448
|
-
}
|
|
449
|
-
}
|
|
450
|
-
const codeSnippets = extractCodeBlocks(userSegments);
|
|
469
|
+
const inputTextParts = [];
|
|
470
|
+
const inputSegments = await processMessages({
|
|
471
|
+
messages: userMessages,
|
|
472
|
+
searchRoots,
|
|
473
|
+
repoRootPath,
|
|
474
|
+
guidelinePatterns,
|
|
475
|
+
guidelinePaths,
|
|
476
|
+
textParts: inputTextParts,
|
|
477
|
+
messageType: "input",
|
|
478
|
+
verbose
|
|
479
|
+
});
|
|
480
|
+
const outputSegments = await processMessages({
|
|
481
|
+
messages: assistantMessages,
|
|
482
|
+
searchRoots,
|
|
483
|
+
repoRootPath,
|
|
484
|
+
guidelinePatterns,
|
|
485
|
+
messageType: "output",
|
|
486
|
+
verbose
|
|
487
|
+
});
|
|
488
|
+
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
451
489
|
const assistantContent = assistantMessages[0]?.content;
|
|
452
|
-
const
|
|
453
|
-
const
|
|
490
|
+
const referenceAnswer = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
491
|
+
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
454
492
|
const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
455
493
|
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
456
494
|
const userFilePaths = [];
|
|
457
|
-
for (const segment of
|
|
495
|
+
for (const segment of inputSegments) {
|
|
458
496
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
459
497
|
userFilePaths.push(segment.resolvedPath);
|
|
460
498
|
}
|
|
@@ -467,15 +505,16 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
467
505
|
id,
|
|
468
506
|
dataset: datasetName,
|
|
469
507
|
conversation_id: conversationId,
|
|
470
|
-
|
|
471
|
-
|
|
508
|
+
question,
|
|
509
|
+
input_segments: inputSegments,
|
|
510
|
+
output_segments: outputSegments,
|
|
472
511
|
system_message: systemMessageContent,
|
|
473
|
-
|
|
512
|
+
reference_answer: referenceAnswer,
|
|
474
513
|
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
|
|
475
514
|
guideline_patterns: guidelinePatterns,
|
|
476
515
|
file_paths: allFilePaths,
|
|
477
516
|
code_snippets: codeSnippets,
|
|
478
|
-
outcome,
|
|
517
|
+
expected_outcome: outcome,
|
|
479
518
|
evaluator: testCaseEvaluatorKind,
|
|
480
519
|
evaluators
|
|
481
520
|
};
|
|
@@ -511,36 +550,36 @@ ${content}`);
|
|
|
511
550
|
logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
512
551
|
}
|
|
513
552
|
}
|
|
514
|
-
const
|
|
515
|
-
for (const segment of testCase.
|
|
553
|
+
const questionParts = [];
|
|
554
|
+
for (const segment of testCase.input_segments) {
|
|
516
555
|
const typeValue = segment.type;
|
|
517
556
|
if (typeof typeValue === "string" && typeValue === "file") {
|
|
518
557
|
const pathValue = segment.path;
|
|
519
558
|
const textValue = segment.text;
|
|
520
559
|
const label = typeof pathValue === "string" ? pathValue : "file";
|
|
521
560
|
const body = typeof textValue === "string" ? textValue : "";
|
|
522
|
-
|
|
561
|
+
questionParts.push(`=== ${label} ===
|
|
523
562
|
${body}`);
|
|
524
563
|
continue;
|
|
525
564
|
}
|
|
526
565
|
if (typeof typeValue === "string" && typeValue === "text") {
|
|
527
566
|
const value = segment.value;
|
|
528
567
|
if (typeof value === "string") {
|
|
529
|
-
|
|
568
|
+
questionParts.push(value);
|
|
530
569
|
}
|
|
531
570
|
continue;
|
|
532
571
|
}
|
|
533
572
|
const genericValue = segment.value;
|
|
534
573
|
if (typeof genericValue === "string") {
|
|
535
|
-
|
|
574
|
+
questionParts.push(genericValue);
|
|
536
575
|
}
|
|
537
576
|
}
|
|
538
577
|
if (testCase.code_snippets.length > 0) {
|
|
539
|
-
|
|
578
|
+
questionParts.push(testCase.code_snippets.join("\n"));
|
|
540
579
|
}
|
|
541
|
-
const
|
|
580
|
+
const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
542
581
|
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
543
|
-
return {
|
|
582
|
+
return { question, guidelines, systemMessage: testCase.system_message };
|
|
544
583
|
}
|
|
545
584
|
async function fileExists2(absolutePath) {
|
|
546
585
|
try {
|
|
@@ -752,7 +791,7 @@ function buildChatPrompt(request) {
|
|
|
752
791
|
${request.guidelines.trim()}`);
|
|
753
792
|
}
|
|
754
793
|
const systemContent = systemSegments.join("\n\n");
|
|
755
|
-
const userContent = request.
|
|
794
|
+
const userContent = request.question.trim();
|
|
756
795
|
const prompt = [
|
|
757
796
|
{
|
|
758
797
|
role: "system",
|
|
@@ -1050,7 +1089,7 @@ var CliProvider = class {
|
|
|
1050
1089
|
healthcheck.commandTemplate,
|
|
1051
1090
|
buildTemplateValues(
|
|
1052
1091
|
{
|
|
1053
|
-
|
|
1092
|
+
question: "",
|
|
1054
1093
|
guidelines: "",
|
|
1055
1094
|
inputFiles: [],
|
|
1056
1095
|
evalCaseId: "",
|
|
@@ -1077,7 +1116,7 @@ var CliProvider = class {
|
|
|
1077
1116
|
function buildTemplateValues(request, config) {
|
|
1078
1117
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
1079
1118
|
return {
|
|
1080
|
-
PROMPT: shellEscape(request.
|
|
1119
|
+
PROMPT: shellEscape(request.question ?? ""),
|
|
1081
1120
|
GUIDELINES: shellEscape(request.guidelines ?? ""),
|
|
1082
1121
|
EVAL_ID: shellEscape(request.evalCaseId ?? ""),
|
|
1083
1122
|
ATTEMPT: shellEscape(String(request.attempt ?? 0)),
|
|
@@ -1141,6 +1180,59 @@ var import_node_os = require("os");
|
|
|
1141
1180
|
var import_node_path5 = __toESM(require("path"), 1);
|
|
1142
1181
|
var import_node_util2 = require("util");
|
|
1143
1182
|
|
|
1183
|
+
// src/evaluation/providers/codex-log-tracker.ts
|
|
1184
|
+
var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
|
|
1185
|
+
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
|
|
1186
|
+
function getCodexLogStore() {
|
|
1187
|
+
const globalObject = globalThis;
|
|
1188
|
+
const existing = globalObject[GLOBAL_LOGS_KEY];
|
|
1189
|
+
if (existing) {
|
|
1190
|
+
return existing;
|
|
1191
|
+
}
|
|
1192
|
+
const created = [];
|
|
1193
|
+
globalObject[GLOBAL_LOGS_KEY] = created;
|
|
1194
|
+
return created;
|
|
1195
|
+
}
|
|
1196
|
+
function getSubscriberStore() {
|
|
1197
|
+
const globalObject = globalThis;
|
|
1198
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
|
|
1199
|
+
if (existing) {
|
|
1200
|
+
return existing;
|
|
1201
|
+
}
|
|
1202
|
+
const created = /* @__PURE__ */ new Set();
|
|
1203
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
|
|
1204
|
+
return created;
|
|
1205
|
+
}
|
|
1206
|
+
function notifySubscribers(entry) {
|
|
1207
|
+
const subscribers = Array.from(getSubscriberStore());
|
|
1208
|
+
for (const listener of subscribers) {
|
|
1209
|
+
try {
|
|
1210
|
+
listener(entry);
|
|
1211
|
+
} catch (error) {
|
|
1212
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1213
|
+
console.warn(`Codex log subscriber failed: ${message}`);
|
|
1214
|
+
}
|
|
1215
|
+
}
|
|
1216
|
+
}
|
|
1217
|
+
function recordCodexLogEntry(entry) {
|
|
1218
|
+
getCodexLogStore().push(entry);
|
|
1219
|
+
notifySubscribers(entry);
|
|
1220
|
+
}
|
|
1221
|
+
function consumeCodexLogEntries() {
|
|
1222
|
+
const store = getCodexLogStore();
|
|
1223
|
+
if (store.length === 0) {
|
|
1224
|
+
return [];
|
|
1225
|
+
}
|
|
1226
|
+
return store.splice(0, store.length);
|
|
1227
|
+
}
|
|
1228
|
+
function subscribeToCodexLogEntries(listener) {
|
|
1229
|
+
const store = getSubscriberStore();
|
|
1230
|
+
store.add(listener);
|
|
1231
|
+
return () => {
|
|
1232
|
+
store.delete(listener);
|
|
1233
|
+
};
|
|
1234
|
+
}
|
|
1235
|
+
|
|
1144
1236
|
// src/evaluation/providers/preread.ts
|
|
1145
1237
|
var import_node_path4 = __toESM(require("path"), 1);
|
|
1146
1238
|
function buildPromptDocument(request, inputFiles, options) {
|
|
@@ -1158,7 +1250,7 @@ function buildPromptDocument(request, inputFiles, options) {
|
|
|
1158
1250
|
if (prereadBlock.length > 0) {
|
|
1159
1251
|
parts.push("\n", prereadBlock);
|
|
1160
1252
|
}
|
|
1161
|
-
parts.push("\n[[ ## user_query ## ]]\n", request.
|
|
1253
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
1162
1254
|
return parts.join("\n").trim();
|
|
1163
1255
|
}
|
|
1164
1256
|
function normalizeInputFiles2(inputFiles) {
|
|
@@ -1242,64 +1334,10 @@ function pathToFileUri(filePath) {
|
|
|
1242
1334
|
return `file://${normalizedPath}`;
|
|
1243
1335
|
}
|
|
1244
1336
|
|
|
1245
|
-
// src/evaluation/providers/codex-log-tracker.ts
|
|
1246
|
-
var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
|
|
1247
|
-
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
|
|
1248
|
-
function getCodexLogStore() {
|
|
1249
|
-
const globalObject = globalThis;
|
|
1250
|
-
const existing = globalObject[GLOBAL_LOGS_KEY];
|
|
1251
|
-
if (existing) {
|
|
1252
|
-
return existing;
|
|
1253
|
-
}
|
|
1254
|
-
const created = [];
|
|
1255
|
-
globalObject[GLOBAL_LOGS_KEY] = created;
|
|
1256
|
-
return created;
|
|
1257
|
-
}
|
|
1258
|
-
function getSubscriberStore() {
|
|
1259
|
-
const globalObject = globalThis;
|
|
1260
|
-
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
|
|
1261
|
-
if (existing) {
|
|
1262
|
-
return existing;
|
|
1263
|
-
}
|
|
1264
|
-
const created = /* @__PURE__ */ new Set();
|
|
1265
|
-
globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
|
|
1266
|
-
return created;
|
|
1267
|
-
}
|
|
1268
|
-
function notifySubscribers(entry) {
|
|
1269
|
-
const subscribers = Array.from(getSubscriberStore());
|
|
1270
|
-
for (const listener of subscribers) {
|
|
1271
|
-
try {
|
|
1272
|
-
listener(entry);
|
|
1273
|
-
} catch (error) {
|
|
1274
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
1275
|
-
console.warn(`Codex log subscriber failed: ${message}`);
|
|
1276
|
-
}
|
|
1277
|
-
}
|
|
1278
|
-
}
|
|
1279
|
-
function recordCodexLogEntry(entry) {
|
|
1280
|
-
getCodexLogStore().push(entry);
|
|
1281
|
-
notifySubscribers(entry);
|
|
1282
|
-
}
|
|
1283
|
-
function consumeCodexLogEntries() {
|
|
1284
|
-
const store = getCodexLogStore();
|
|
1285
|
-
if (store.length === 0) {
|
|
1286
|
-
return [];
|
|
1287
|
-
}
|
|
1288
|
-
return store.splice(0, store.length);
|
|
1289
|
-
}
|
|
1290
|
-
function subscribeToCodexLogEntries(listener) {
|
|
1291
|
-
const store = getSubscriberStore();
|
|
1292
|
-
store.add(listener);
|
|
1293
|
-
return () => {
|
|
1294
|
-
store.delete(listener);
|
|
1295
|
-
};
|
|
1296
|
-
}
|
|
1297
|
-
|
|
1298
1337
|
// src/evaluation/providers/codex.ts
|
|
1299
1338
|
var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
|
|
1300
1339
|
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
1301
1340
|
var PROMPT_FILENAME = "prompt.md";
|
|
1302
|
-
var FILES_DIR = "files";
|
|
1303
1341
|
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
1304
1342
|
var CodexProvider = class {
|
|
1305
1343
|
id;
|
|
@@ -1322,21 +1360,10 @@ var CodexProvider = class {
|
|
|
1322
1360
|
}
|
|
1323
1361
|
await this.ensureEnvironmentReady();
|
|
1324
1362
|
const inputFiles = normalizeInputFiles2(request.inputFiles);
|
|
1325
|
-
const originalGuidelines = new Set(
|
|
1326
|
-
collectGuidelineFiles(inputFiles, request.guideline_patterns).map((file) => import_node_path5.default.resolve(file))
|
|
1327
|
-
);
|
|
1328
1363
|
const workspaceRoot = await this.createWorkspace();
|
|
1329
1364
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
1330
1365
|
try {
|
|
1331
|
-
const
|
|
1332
|
-
inputFiles,
|
|
1333
|
-
workspaceRoot,
|
|
1334
|
-
originalGuidelines
|
|
1335
|
-
);
|
|
1336
|
-
const promptContent = buildPromptDocument(request, mirroredInputFiles, {
|
|
1337
|
-
guidelinePatterns: request.guideline_patterns,
|
|
1338
|
-
guidelineOverrides: guidelineMirrors
|
|
1339
|
-
});
|
|
1366
|
+
const promptContent = buildPromptDocument(request, inputFiles);
|
|
1340
1367
|
const promptFile = import_node_path5.default.join(workspaceRoot, PROMPT_FILENAME);
|
|
1341
1368
|
await (0, import_promises3.writeFile)(promptFile, promptContent, "utf8");
|
|
1342
1369
|
const args = this.buildCodexArgs();
|
|
@@ -1365,7 +1392,7 @@ var CodexProvider = class {
|
|
|
1365
1392
|
executable: this.resolvedExecutable ?? this.config.executable,
|
|
1366
1393
|
promptFile,
|
|
1367
1394
|
workspace: workspaceRoot,
|
|
1368
|
-
inputFiles
|
|
1395
|
+
inputFiles,
|
|
1369
1396
|
logFile: logger?.filePath
|
|
1370
1397
|
}
|
|
1371
1398
|
};
|
|
@@ -1420,37 +1447,6 @@ var CodexProvider = class {
|
|
|
1420
1447
|
throw error;
|
|
1421
1448
|
}
|
|
1422
1449
|
}
|
|
1423
|
-
async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
|
|
1424
|
-
if (!inputFiles || inputFiles.length === 0) {
|
|
1425
|
-
return {
|
|
1426
|
-
mirroredInputFiles: void 0,
|
|
1427
|
-
guidelineMirrors: /* @__PURE__ */ new Set()
|
|
1428
|
-
};
|
|
1429
|
-
}
|
|
1430
|
-
const filesRoot = import_node_path5.default.join(workspaceRoot, FILES_DIR);
|
|
1431
|
-
await (0, import_promises3.mkdir)(filesRoot, { recursive: true });
|
|
1432
|
-
const mirrored = [];
|
|
1433
|
-
const guidelineMirrors = /* @__PURE__ */ new Set();
|
|
1434
|
-
const nameCounts = /* @__PURE__ */ new Map();
|
|
1435
|
-
for (const inputFile of inputFiles) {
|
|
1436
|
-
const absoluteSource = import_node_path5.default.resolve(inputFile);
|
|
1437
|
-
const baseName = import_node_path5.default.basename(absoluteSource);
|
|
1438
|
-
const count = nameCounts.get(baseName) ?? 0;
|
|
1439
|
-
nameCounts.set(baseName, count + 1);
|
|
1440
|
-
const finalName = count === 0 ? baseName : `${baseName}.${count}`;
|
|
1441
|
-
const destination = import_node_path5.default.join(filesRoot, finalName);
|
|
1442
|
-
await (0, import_promises3.copyFile)(absoluteSource, destination);
|
|
1443
|
-
const resolvedDestination = import_node_path5.default.resolve(destination);
|
|
1444
|
-
mirrored.push(resolvedDestination);
|
|
1445
|
-
if (guidelineOriginals.has(absoluteSource)) {
|
|
1446
|
-
guidelineMirrors.add(resolvedDestination);
|
|
1447
|
-
}
|
|
1448
|
-
}
|
|
1449
|
-
return {
|
|
1450
|
-
mirroredInputFiles: mirrored,
|
|
1451
|
-
guidelineMirrors
|
|
1452
|
-
};
|
|
1453
|
-
}
|
|
1454
1450
|
async createWorkspace() {
|
|
1455
1451
|
return await (0, import_promises3.mkdtemp)(import_node_path5.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
|
|
1456
1452
|
}
|
|
@@ -2028,7 +2024,7 @@ var MockProvider = class {
|
|
|
2028
2024
|
return {
|
|
2029
2025
|
text: this.cannedResponse,
|
|
2030
2026
|
raw: {
|
|
2031
|
-
|
|
2027
|
+
question: request.question,
|
|
2032
2028
|
guidelines: request.guidelines
|
|
2033
2029
|
}
|
|
2034
2030
|
};
|
|
@@ -2421,23 +2417,25 @@ function resolveOptionalString(source, env, description, options) {
|
|
|
2421
2417
|
if (trimmed.length === 0) {
|
|
2422
2418
|
return void 0;
|
|
2423
2419
|
}
|
|
2424
|
-
const
|
|
2425
|
-
if (
|
|
2426
|
-
|
|
2427
|
-
|
|
2420
|
+
const envVarMatch = trimmed.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
|
|
2421
|
+
if (envVarMatch) {
|
|
2422
|
+
const varName = envVarMatch[1];
|
|
2423
|
+
const envValue = env[varName];
|
|
2424
|
+
if (envValue !== void 0) {
|
|
2425
|
+
if (envValue.trim().length === 0) {
|
|
2426
|
+
throw new Error(`Environment variable '${varName}' for ${description} is empty`);
|
|
2427
|
+
}
|
|
2428
|
+
return envValue;
|
|
2428
2429
|
}
|
|
2429
|
-
|
|
2430
|
-
}
|
|
2431
|
-
const allowLiteral = options?.allowLiteral ?? false;
|
|
2432
|
-
const optionalEnv = options?.optionalEnv ?? false;
|
|
2433
|
-
const looksLikeEnv = isLikelyEnvReference(trimmed);
|
|
2434
|
-
if (looksLikeEnv) {
|
|
2430
|
+
const optionalEnv = options?.optionalEnv ?? false;
|
|
2435
2431
|
if (optionalEnv) {
|
|
2436
2432
|
return void 0;
|
|
2437
2433
|
}
|
|
2438
|
-
|
|
2439
|
-
|
|
2440
|
-
|
|
2434
|
+
throw new Error(`Environment variable '${varName}' required for ${description} is not set`);
|
|
2435
|
+
}
|
|
2436
|
+
const allowLiteral = options?.allowLiteral ?? false;
|
|
2437
|
+
if (!allowLiteral) {
|
|
2438
|
+
throw new Error(`${description} must use \${{ VARIABLE_NAME }} syntax for environment variables or be marked as allowing literals`);
|
|
2441
2439
|
}
|
|
2442
2440
|
return trimmed;
|
|
2443
2441
|
}
|
|
@@ -2484,9 +2482,6 @@ function resolveOptionalBoolean(source) {
|
|
|
2484
2482
|
}
|
|
2485
2483
|
throw new Error("expected boolean value");
|
|
2486
2484
|
}
|
|
2487
|
-
function isLikelyEnvReference(value) {
|
|
2488
|
-
return /^[A-Z0-9_]+$/.test(value);
|
|
2489
|
-
}
|
|
2490
2485
|
function resolveOptionalStringArray(source, env, description) {
|
|
2491
2486
|
if (source === void 0 || source === null) {
|
|
2492
2487
|
return void 0;
|
|
@@ -2507,21 +2502,25 @@ function resolveOptionalStringArray(source, env, description) {
|
|
|
2507
2502
|
if (trimmed.length === 0) {
|
|
2508
2503
|
throw new Error(`${description}[${i}] cannot be empty`);
|
|
2509
2504
|
}
|
|
2510
|
-
const
|
|
2511
|
-
if (
|
|
2512
|
-
|
|
2513
|
-
|
|
2505
|
+
const envVarMatch = trimmed.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
|
|
2506
|
+
if (envVarMatch) {
|
|
2507
|
+
const varName = envVarMatch[1];
|
|
2508
|
+
const envValue = env[varName];
|
|
2509
|
+
if (envValue !== void 0) {
|
|
2510
|
+
if (envValue.trim().length === 0) {
|
|
2511
|
+
throw new Error(`Environment variable '${varName}' for ${description}[${i}] is empty`);
|
|
2512
|
+
}
|
|
2513
|
+
resolved.push(envValue);
|
|
2514
|
+
continue;
|
|
2514
2515
|
}
|
|
2515
|
-
|
|
2516
|
-
} else {
|
|
2517
|
-
resolved.push(trimmed);
|
|
2516
|
+
throw new Error(`Environment variable '${varName}' for ${description}[${i}] is not set`);
|
|
2518
2517
|
}
|
|
2518
|
+
resolved.push(trimmed);
|
|
2519
2519
|
}
|
|
2520
2520
|
return resolved.length > 0 ? resolved : void 0;
|
|
2521
2521
|
}
|
|
2522
2522
|
|
|
2523
2523
|
// src/evaluation/providers/vscode.ts
|
|
2524
|
-
var import_promises4 = require("fs/promises");
|
|
2525
2524
|
var import_node_path6 = __toESM(require("path"), 1);
|
|
2526
2525
|
var import_subagent = require("subagent");
|
|
2527
2526
|
var VSCodeProvider = class {
|
|
@@ -2565,7 +2564,7 @@ var VSCodeProvider = class {
|
|
|
2565
2564
|
}
|
|
2566
2565
|
};
|
|
2567
2566
|
}
|
|
2568
|
-
const responseText = await (
|
|
2567
|
+
const responseText = await readTextFile(session.responseFile);
|
|
2569
2568
|
return {
|
|
2570
2569
|
text: responseText,
|
|
2571
2570
|
raw: {
|
|
@@ -2619,7 +2618,7 @@ var VSCodeProvider = class {
|
|
|
2619
2618
|
}
|
|
2620
2619
|
const responses = [];
|
|
2621
2620
|
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
2622
|
-
const responseText = await (
|
|
2621
|
+
const responseText = await readTextFile(responseFile);
|
|
2623
2622
|
responses.push({
|
|
2624
2623
|
text: responseText,
|
|
2625
2624
|
raw: {
|
|
@@ -2644,7 +2643,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
|
2644
2643
|
if (prereadBlock.length > 0) {
|
|
2645
2644
|
parts.push("\n", prereadBlock);
|
|
2646
2645
|
}
|
|
2647
|
-
parts.push("\n[[ ## user_query ## ]]\n", request.
|
|
2646
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
2648
2647
|
return parts.join("\n").trim();
|
|
2649
2648
|
}
|
|
2650
2649
|
function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
@@ -2769,12 +2768,20 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
2769
2768
|
|
|
2770
2769
|
// src/evaluation/providers/targets-file.ts
|
|
2771
2770
|
var import_node_fs4 = require("fs");
|
|
2772
|
-
var
|
|
2771
|
+
var import_promises4 = require("fs/promises");
|
|
2773
2772
|
var import_node_path7 = __toESM(require("path"), 1);
|
|
2774
2773
|
var import_yaml2 = require("yaml");
|
|
2775
2774
|
|
|
2776
2775
|
// src/evaluation/providers/types.ts
|
|
2777
|
-
var
|
|
2776
|
+
var AGENT_PROVIDER_KINDS = [
|
|
2777
|
+
"codex",
|
|
2778
|
+
"vscode",
|
|
2779
|
+
"vscode-insiders"
|
|
2780
|
+
];
|
|
2781
|
+
var TARGETS_SCHEMA_V2 = "agentv-targets-v2.1";
|
|
2782
|
+
function isAgentProvider(provider) {
|
|
2783
|
+
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
2784
|
+
}
|
|
2778
2785
|
|
|
2779
2786
|
// src/evaluation/providers/targets-file.ts
|
|
2780
2787
|
function isRecord(value) {
|
|
@@ -2831,7 +2838,7 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
2831
2838
|
}
|
|
2832
2839
|
async function fileExists3(filePath) {
|
|
2833
2840
|
try {
|
|
2834
|
-
await (0,
|
|
2841
|
+
await (0, import_promises4.access)(filePath, import_node_fs4.constants.F_OK);
|
|
2835
2842
|
return true;
|
|
2836
2843
|
} catch {
|
|
2837
2844
|
return false;
|
|
@@ -2842,7 +2849,7 @@ async function readTargetDefinitions(filePath) {
|
|
|
2842
2849
|
if (!await fileExists3(absolutePath)) {
|
|
2843
2850
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
2844
2851
|
}
|
|
2845
|
-
const raw = await (0,
|
|
2852
|
+
const raw = await (0, import_promises4.readFile)(absolutePath, "utf8");
|
|
2846
2853
|
const parsed = (0, import_yaml2.parse)(raw);
|
|
2847
2854
|
if (!isRecord(parsed)) {
|
|
2848
2855
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
@@ -2886,30 +2893,7 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
2886
2893
|
}
|
|
2887
2894
|
|
|
2888
2895
|
// src/evaluation/evaluators.ts
|
|
2889
|
-
var import_ax3 = require("@ax-llm/ax");
|
|
2890
2896
|
var import_node_crypto2 = require("crypto");
|
|
2891
|
-
var LLM_JUDGE_SIGNATURE = (0, import_ax3.f)().input(
|
|
2892
|
-
"evaluationContext",
|
|
2893
|
-
import_ax3.f.object(
|
|
2894
|
-
{
|
|
2895
|
-
expectedOutcome: import_ax3.f.string("The expected outcome for the original task"),
|
|
2896
|
-
request: import_ax3.f.string("The original task request"),
|
|
2897
|
-
referenceAnswer: import_ax3.f.string("The gold standard reference answer"),
|
|
2898
|
-
generatedAnswer: import_ax3.f.string("The answer to evaluate"),
|
|
2899
|
-
guidelines: import_ax3.f.string("Additional evaluation guidelines or instructions").optional()
|
|
2900
|
-
},
|
|
2901
|
-
"Complete evaluation context for the judge"
|
|
2902
|
-
)
|
|
2903
|
-
).output(
|
|
2904
|
-
"evaluation",
|
|
2905
|
-
import_ax3.f.object({
|
|
2906
|
-
score: import_ax3.f.number("Score between 0.0 and 1.0").min(0).max(1),
|
|
2907
|
-
hits: import_ax3.f.string("Brief specific achievement").array(),
|
|
2908
|
-
misses: import_ax3.f.string("Brief specific failure or omission").array(),
|
|
2909
|
-
reasoning: import_ax3.f.string("Concise explanation for the score").max(500)
|
|
2910
|
-
})
|
|
2911
|
-
).build();
|
|
2912
|
-
var LLM_JUDGE = (0, import_ax3.ax)(LLM_JUDGE_SIGNATURE);
|
|
2913
2897
|
var LlmJudgeEvaluator = class {
|
|
2914
2898
|
kind = "llm_judge";
|
|
2915
2899
|
resolveJudgeProvider;
|
|
@@ -2927,52 +2911,29 @@ var LlmJudgeEvaluator = class {
|
|
|
2927
2911
|
if (!judgeProvider) {
|
|
2928
2912
|
throw new Error("No judge provider available for LLM grading");
|
|
2929
2913
|
}
|
|
2930
|
-
if (providerSupportsAx(judgeProvider)) {
|
|
2931
|
-
return this.evaluateWithAx(context, judgeProvider);
|
|
2932
|
-
}
|
|
2933
2914
|
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2934
2915
|
}
|
|
2935
|
-
async evaluateWithAx(context, judgeProvider) {
|
|
2936
|
-
const ai = judgeProvider.getAxAI();
|
|
2937
|
-
const guidelines = context.promptInputs.guidelines?.trim();
|
|
2938
|
-
const evaluationContext = {
|
|
2939
|
-
expectedOutcome: context.evalCase.outcome.trim(),
|
|
2940
|
-
request: context.evalCase.task.trim(),
|
|
2941
|
-
referenceAnswer: context.evalCase.expected_assistant_raw.trim(),
|
|
2942
|
-
generatedAnswer: context.candidate.trim(),
|
|
2943
|
-
...guidelines ? { guidelines } : {}
|
|
2944
|
-
};
|
|
2945
|
-
const options = this.buildJudgeForwardOptions(context);
|
|
2946
|
-
const result = await LLM_JUDGE.forward(ai, { evaluationContext }, options);
|
|
2947
|
-
const evaluation = result.evaluation;
|
|
2948
|
-
const expectedAspectCount = Math.max(
|
|
2949
|
-
evaluation.hits.length + evaluation.misses.length,
|
|
2950
|
-
1
|
|
2951
|
-
);
|
|
2952
|
-
return {
|
|
2953
|
-
score: evaluation.score,
|
|
2954
|
-
hits: evaluation.hits,
|
|
2955
|
-
misses: evaluation.misses,
|
|
2956
|
-
expectedAspectCount,
|
|
2957
|
-
reasoning: evaluation.reasoning,
|
|
2958
|
-
evaluatorRawRequest: {
|
|
2959
|
-
id: (0, import_node_crypto2.randomUUID)(),
|
|
2960
|
-
provider: judgeProvider.id,
|
|
2961
|
-
target: context.target.name,
|
|
2962
|
-
method: "ax-structured-output",
|
|
2963
|
-
signature: LLM_JUDGE_SIGNATURE.toString()
|
|
2964
|
-
}
|
|
2965
|
-
};
|
|
2966
|
-
}
|
|
2967
2916
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
2968
|
-
|
|
2969
|
-
|
|
2917
|
+
let prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
2918
|
+
let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
|
|
2919
|
+
if (systemPrompt && hasTemplateVariables(systemPrompt)) {
|
|
2920
|
+
const variables = {
|
|
2921
|
+
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
2922
|
+
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
2923
|
+
candidate_answer: context.candidate,
|
|
2924
|
+
reference_answer: context.evalCase.reference_answer,
|
|
2925
|
+
expected_outcome: context.evalCase.expected_outcome,
|
|
2926
|
+
question: context.evalCase.question
|
|
2927
|
+
};
|
|
2928
|
+
prompt = substituteVariables(systemPrompt, variables);
|
|
2929
|
+
systemPrompt = QUALITY_SYSTEM_PROMPT;
|
|
2930
|
+
}
|
|
2970
2931
|
const metadata = {
|
|
2971
2932
|
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2972
2933
|
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
2973
2934
|
};
|
|
2974
2935
|
const response = await judgeProvider.invoke({
|
|
2975
|
-
prompt,
|
|
2936
|
+
question: prompt,
|
|
2976
2937
|
metadata,
|
|
2977
2938
|
evalCaseId: context.evalCase.id,
|
|
2978
2939
|
attempt: context.attempt,
|
|
@@ -3002,33 +2963,11 @@ var LlmJudgeEvaluator = class {
|
|
|
3002
2963
|
evaluatorRawRequest
|
|
3003
2964
|
};
|
|
3004
2965
|
}
|
|
3005
|
-
buildJudgeForwardOptions(context) {
|
|
3006
|
-
const modelConfig = this.buildJudgeModelConfig();
|
|
3007
|
-
if (modelConfig === void 0 && context.judgeModel === void 0) {
|
|
3008
|
-
return void 0;
|
|
3009
|
-
}
|
|
3010
|
-
return {
|
|
3011
|
-
...context.judgeModel ? { model: context.judgeModel } : {},
|
|
3012
|
-
...modelConfig ? { modelConfig } : {}
|
|
3013
|
-
};
|
|
3014
|
-
}
|
|
3015
|
-
buildJudgeModelConfig() {
|
|
3016
|
-
if (this.maxOutputTokens === void 0 && this.temperature === void 0) {
|
|
3017
|
-
return void 0;
|
|
3018
|
-
}
|
|
3019
|
-
return {
|
|
3020
|
-
...this.maxOutputTokens !== void 0 ? { maxTokens: this.maxOutputTokens } : {},
|
|
3021
|
-
...this.temperature !== void 0 ? { temperature: this.temperature } : {}
|
|
3022
|
-
};
|
|
3023
|
-
}
|
|
3024
2966
|
};
|
|
3025
|
-
function providerSupportsAx(provider) {
|
|
3026
|
-
return typeof provider.getAxAI === "function";
|
|
3027
|
-
}
|
|
3028
2967
|
var QUALITY_SYSTEM_PROMPT = [
|
|
3029
|
-
"You are an expert evaluator. Your goal is to grade the
|
|
2968
|
+
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
3030
2969
|
"",
|
|
3031
|
-
"Use the reference_answer as a gold standard for a high-quality response. The
|
|
2970
|
+
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
|
|
3032
2971
|
"",
|
|
3033
2972
|
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
3034
2973
|
"",
|
|
@@ -3041,18 +2980,18 @@ var QUALITY_SYSTEM_PROMPT = [
|
|
|
3041
2980
|
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
3042
2981
|
"}"
|
|
3043
2982
|
].join("\n");
|
|
3044
|
-
function buildQualityPrompt(
|
|
2983
|
+
function buildQualityPrompt(evalCase, candidate) {
|
|
3045
2984
|
const parts = [
|
|
3046
2985
|
"[[ ## expected_outcome ## ]]",
|
|
3047
|
-
|
|
2986
|
+
evalCase.expected_outcome.trim(),
|
|
3048
2987
|
"",
|
|
3049
|
-
"[[ ##
|
|
3050
|
-
|
|
2988
|
+
"[[ ## question ## ]]",
|
|
2989
|
+
evalCase.question.trim(),
|
|
3051
2990
|
"",
|
|
3052
2991
|
"[[ ## reference_answer ## ]]",
|
|
3053
|
-
|
|
2992
|
+
evalCase.reference_answer.trim(),
|
|
3054
2993
|
"",
|
|
3055
|
-
"[[ ##
|
|
2994
|
+
"[[ ## candidate_answer ## ]]",
|
|
3056
2995
|
candidate.trim(),
|
|
3057
2996
|
"",
|
|
3058
2997
|
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
@@ -3152,14 +3091,14 @@ var CodeEvaluator = class {
|
|
|
3152
3091
|
async evaluate(context) {
|
|
3153
3092
|
const inputPayload = JSON.stringify(
|
|
3154
3093
|
{
|
|
3155
|
-
|
|
3156
|
-
|
|
3157
|
-
|
|
3158
|
-
|
|
3094
|
+
question: context.evalCase.question,
|
|
3095
|
+
expected_outcome: context.evalCase.expected_outcome,
|
|
3096
|
+
reference_answer: context.evalCase.reference_answer,
|
|
3097
|
+
candidate_answer: context.candidate,
|
|
3159
3098
|
system_message: context.promptInputs.systemMessage ?? "",
|
|
3160
3099
|
guideline_paths: context.evalCase.guideline_paths,
|
|
3161
|
-
|
|
3162
|
-
|
|
3100
|
+
input_files: context.evalCase.file_paths,
|
|
3101
|
+
input_segments: context.evalCase.input_segments
|
|
3163
3102
|
},
|
|
3164
3103
|
null,
|
|
3165
3104
|
2
|
|
@@ -3245,10 +3184,18 @@ function parseJsonSafe(payload) {
|
|
|
3245
3184
|
return void 0;
|
|
3246
3185
|
}
|
|
3247
3186
|
}
|
|
3187
|
+
function hasTemplateVariables(text) {
|
|
3188
|
+
return /\$\{[a-zA-Z0-9_]+\}/.test(text);
|
|
3189
|
+
}
|
|
3190
|
+
function substituteVariables(template, variables) {
|
|
3191
|
+
return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
|
|
3192
|
+
return variables[varName] ?? match;
|
|
3193
|
+
});
|
|
3194
|
+
}
|
|
3248
3195
|
|
|
3249
3196
|
// src/evaluation/orchestrator.ts
|
|
3250
3197
|
var import_node_crypto3 = require("crypto");
|
|
3251
|
-
var
|
|
3198
|
+
var import_promises5 = require("fs/promises");
|
|
3252
3199
|
var import_node_path8 = __toESM(require("path"), 1);
|
|
3253
3200
|
|
|
3254
3201
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
@@ -3567,7 +3514,8 @@ async function runEvaluation(options) {
|
|
|
3567
3514
|
target.name,
|
|
3568
3515
|
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
3569
3516
|
outcome.reason,
|
|
3570
|
-
promptInputs
|
|
3517
|
+
promptInputs,
|
|
3518
|
+
primaryProvider
|
|
3571
3519
|
);
|
|
3572
3520
|
results.push(errorResult);
|
|
3573
3521
|
if (onResult) {
|
|
@@ -3601,7 +3549,7 @@ async function runBatchEvaluation(options) {
|
|
|
3601
3549
|
const batchRequests = evalCases.map((evalCase, index) => {
|
|
3602
3550
|
const promptInputs = promptInputsList[index];
|
|
3603
3551
|
return {
|
|
3604
|
-
|
|
3552
|
+
question: promptInputs.question,
|
|
3605
3553
|
guidelines: promptInputs.guidelines,
|
|
3606
3554
|
guideline_patterns: evalCase.guideline_patterns,
|
|
3607
3555
|
inputFiles: evalCase.file_paths,
|
|
@@ -3651,7 +3599,7 @@ async function runBatchEvaluation(options) {
|
|
|
3651
3599
|
agentTimeoutMs
|
|
3652
3600
|
});
|
|
3653
3601
|
} catch (error) {
|
|
3654
|
-
const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
3602
|
+
const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
3655
3603
|
results.push(errorResult);
|
|
3656
3604
|
if (onResult) {
|
|
3657
3605
|
await onResult(errorResult);
|
|
@@ -3728,7 +3676,7 @@ async function runEvalCase(options) {
|
|
|
3728
3676
|
attempt += 1;
|
|
3729
3677
|
continue;
|
|
3730
3678
|
}
|
|
3731
|
-
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
3679
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
3732
3680
|
}
|
|
3733
3681
|
}
|
|
3734
3682
|
if (!providerResponse) {
|
|
@@ -3737,7 +3685,8 @@ async function runEvalCase(options) {
|
|
|
3737
3685
|
target.name,
|
|
3738
3686
|
nowFn(),
|
|
3739
3687
|
lastError ?? new Error("Provider did not return a response"),
|
|
3740
|
-
promptInputs
|
|
3688
|
+
promptInputs,
|
|
3689
|
+
provider
|
|
3741
3690
|
);
|
|
3742
3691
|
}
|
|
3743
3692
|
if (cacheKey && cache && !cachedResponse) {
|
|
@@ -3757,7 +3706,7 @@ async function runEvalCase(options) {
|
|
|
3757
3706
|
agentTimeoutMs
|
|
3758
3707
|
});
|
|
3759
3708
|
} catch (error) {
|
|
3760
|
-
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
3709
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
3761
3710
|
}
|
|
3762
3711
|
}
|
|
3763
3712
|
async function evaluateCandidate(options) {
|
|
@@ -3788,8 +3737,8 @@ async function evaluateCandidate(options) {
|
|
|
3788
3737
|
});
|
|
3789
3738
|
const completedAt = nowFn();
|
|
3790
3739
|
const rawRequest = {
|
|
3791
|
-
|
|
3792
|
-
guidelines: promptInputs.guidelines,
|
|
3740
|
+
question: promptInputs.question,
|
|
3741
|
+
...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
|
|
3793
3742
|
guideline_paths: evalCase.guideline_paths,
|
|
3794
3743
|
system_message: promptInputs.systemMessage ?? ""
|
|
3795
3744
|
};
|
|
@@ -3800,7 +3749,7 @@ async function evaluateCandidate(options) {
|
|
|
3800
3749
|
score: score.score,
|
|
3801
3750
|
hits: score.hits,
|
|
3802
3751
|
misses: score.misses,
|
|
3803
|
-
|
|
3752
|
+
candidate_answer: candidate,
|
|
3804
3753
|
expected_aspect_count: score.expectedAspectCount,
|
|
3805
3754
|
target: target.name,
|
|
3806
3755
|
timestamp: completedAt.toISOString(),
|
|
@@ -4007,14 +3956,14 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
|
4007
3956
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
4008
3957
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
4009
3958
|
const filePath = import_node_path8.default.resolve(directory, filename);
|
|
4010
|
-
await (0,
|
|
3959
|
+
await (0, import_promises5.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
|
|
4011
3960
|
const payload = {
|
|
4012
3961
|
eval_id: evalCase.id,
|
|
4013
|
-
|
|
3962
|
+
question: promptInputs.question,
|
|
4014
3963
|
guidelines: promptInputs.guidelines,
|
|
4015
3964
|
guideline_paths: evalCase.guideline_paths
|
|
4016
3965
|
};
|
|
4017
|
-
await (0,
|
|
3966
|
+
await (0, import_promises5.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
4018
3967
|
}
|
|
4019
3968
|
function sanitizeFilename(value) {
|
|
4020
3969
|
if (!value) {
|
|
@@ -4032,7 +3981,7 @@ async function invokeProvider(provider, options) {
|
|
|
4032
3981
|
}
|
|
4033
3982
|
try {
|
|
4034
3983
|
return await provider.invoke({
|
|
4035
|
-
|
|
3984
|
+
question: promptInputs.question,
|
|
4036
3985
|
guidelines: promptInputs.guidelines,
|
|
4037
3986
|
guideline_patterns: evalCase.guideline_patterns,
|
|
4038
3987
|
inputFiles: evalCase.file_paths,
|
|
@@ -4049,11 +3998,11 @@ async function invokeProvider(provider, options) {
|
|
|
4049
3998
|
}
|
|
4050
3999
|
}
|
|
4051
4000
|
}
|
|
4052
|
-
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
|
|
4001
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
|
|
4053
4002
|
const message = error instanceof Error ? error.message : String(error);
|
|
4054
4003
|
const rawRequest = {
|
|
4055
|
-
|
|
4056
|
-
guidelines: promptInputs.guidelines,
|
|
4004
|
+
question: promptInputs.question,
|
|
4005
|
+
...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
|
|
4057
4006
|
guideline_paths: evalCase.guideline_paths,
|
|
4058
4007
|
system_message: promptInputs.systemMessage ?? "",
|
|
4059
4008
|
error: message
|
|
@@ -4065,7 +4014,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
|
|
|
4065
4014
|
score: 0,
|
|
4066
4015
|
hits: [],
|
|
4067
4016
|
misses: [`Error: ${message}`],
|
|
4068
|
-
|
|
4017
|
+
candidate_answer: `Error occurred: ${message}`,
|
|
4069
4018
|
expected_aspect_count: 0,
|
|
4070
4019
|
target: targetName,
|
|
4071
4020
|
timestamp: timestamp.toISOString(),
|
|
@@ -4078,7 +4027,7 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
|
4078
4027
|
hash.update(provider.id);
|
|
4079
4028
|
hash.update(target.name);
|
|
4080
4029
|
hash.update(evalCase.id);
|
|
4081
|
-
hash.update(promptInputs.
|
|
4030
|
+
hash.update(promptInputs.question);
|
|
4082
4031
|
hash.update(promptInputs.guidelines);
|
|
4083
4032
|
hash.update(promptInputs.systemMessage ?? "");
|
|
4084
4033
|
return hash.digest("hex");
|