@agentv/core 0.5.3 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-NL7K4CAK.js → chunk-L7I5UTJU.js} +7 -2
- package/dist/chunk-L7I5UTJU.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +260 -114
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +34 -10
- package/dist/index.d.ts +34 -10
- package/dist/index.js +255 -115
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-NL7K4CAK.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -36,6 +36,7 @@ __export(index_exports, {
|
|
|
36
36
|
buildDirectoryChain: () => buildDirectoryChain,
|
|
37
37
|
buildPromptInputs: () => buildPromptInputs,
|
|
38
38
|
buildSearchRoots: () => buildSearchRoots,
|
|
39
|
+
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
39
40
|
createAgentKernel: () => createAgentKernel,
|
|
40
41
|
createProvider: () => createProvider,
|
|
41
42
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
@@ -52,11 +53,13 @@ __export(index_exports, {
|
|
|
52
53
|
listTargetNames: () => listTargetNames,
|
|
53
54
|
loadEvalCases: () => loadEvalCases,
|
|
54
55
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
56
|
+
readTextFile: () => readTextFile,
|
|
55
57
|
resolveAndCreateProvider: () => resolveAndCreateProvider,
|
|
56
58
|
resolveFileReference: () => resolveFileReference,
|
|
57
59
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
58
60
|
runEvalCase: () => runEvalCase,
|
|
59
|
-
runEvaluation: () => runEvaluation
|
|
61
|
+
runEvaluation: () => runEvaluation,
|
|
62
|
+
subscribeToCodexLogEntries: () => subscribeToCodexLogEntries
|
|
60
63
|
});
|
|
61
64
|
module.exports = __toCommonJS(index_exports);
|
|
62
65
|
|
|
@@ -130,6 +133,10 @@ async function fileExists(filePath) {
|
|
|
130
133
|
return false;
|
|
131
134
|
}
|
|
132
135
|
}
|
|
136
|
+
/**
 * Reads a file as UTF-8 text and normalizes Windows line endings.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<string>} The file content with every CRLF pair replaced by LF.
 */
async function readTextFile(filePath) {
  const toUnixLineEndings = (text) => text.replace(/\r\n/g, "\n");
  return toUnixLineEndings(await (0, import_promises.readFile)(filePath, "utf8"));
}
|
|
133
140
|
async function findGitRoot(startPath) {
|
|
134
141
|
let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
|
|
135
142
|
const root = import_node_path.default.parse(currentDir).root;
|
|
@@ -292,6 +299,87 @@ function extractCodeBlocks(segments) {
|
|
|
292
299
|
}
|
|
293
300
|
return codeBlocks;
|
|
294
301
|
}
|
|
302
|
+
/**
 * Flattens a list of messages into content segments, resolving `file`
 * segments to their on-disk content.
 *
 * - String message content becomes a `{ type: "text", value }` segment.
 * - Array content is walked segment by segment; entries that are not JSON
 *   objects are ignored.
 * - `file` segments are resolved through `resolveFileReference`; missing or
 *   unreadable files produce a warning and are skipped.
 * - For `messageType === "input"`, files matching `guidelinePatterns` are
 *   recorded in `guidelinePaths` instead of being emitted as segments.
 * - All other segments are cloned and emitted as-is.
 *
 * @param {object} options
 * @param {Array<{content: string | Array<unknown>}>} options.messages - Messages to process.
 * @param {string[]} options.searchRoots - Roots passed to `resolveFileReference`.
 * @param {string} options.repoRootPath - Base used to compute the path tested against guideline patterns.
 * @param {string[]} [options.guidelinePatterns] - Patterns identifying guideline files.
 * @param {string[]} [options.guidelinePaths] - Output array; resolved guideline paths are appended.
 * @param {string[]} [options.textParts] - Output array; inline text values are appended.
 * @param {string} options.messageType - `"input"` enables guideline detection; any other value is reported as expected output.
 * @param {boolean} [options.verbose] - When truthy, logs each resolved file.
 * @returns {Promise<object[]>} The collected segments, in message order.
 */
async function processMessages(options) {
  const {
    messages,
    searchRoots,
    repoRootPath,
    guidelinePatterns,
    guidelinePaths,
    textParts,
    messageType,
    verbose
  } = options;
  const isInput = messageType === "input";
  const segments = [];
  for (const message of messages) {
    const content = message.content;
    if (typeof content === "string") {
      segments.push({ type: "text", value: content });
      if (textParts) {
        textParts.push(content);
      }
      continue;
    }
    for (const rawSegment of content) {
      if (!isJsonObject(rawSegment)) {
        continue;
      }
      const segmentType = asString(rawSegment.type);
      if (segmentType !== "file") {
        // Non-file segments pass through as a copy so callers never share
        // objects with the parsed eval file.
        const clonedSegment = cloneJsonObject(rawSegment);
        segments.push(clonedSegment);
        const inlineValue = clonedSegment.value;
        if (typeof inlineValue === "string" && textParts) {
          textParts.push(inlineValue);
        }
        continue;
      }
      const rawValue = asString(rawSegment.value);
      if (!rawValue) {
        continue;
      }
      const { displayPath, resolvedPath, attempted } = await resolveFileReference(
        rawValue,
        searchRoots
      );
      if (!resolvedPath) {
        const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
        const context = isInput ? "" : " in expected_messages";
        logWarning(`File not found${context}: ${displayPath}`, attempts);
        continue;
      }
      try {
        const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
        if (isInput && guidelinePatterns && guidelinePaths) {
          const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
          if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
            // Guideline files are tracked by path only; they are not emitted
            // as input segments.
            guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
            if (verbose) {
              console.log(` [Guideline] Found: ${displayPath}`);
              console.log(` Resolved to: ${resolvedPath}`);
            }
            continue;
          }
        }
        segments.push({
          type: "file",
          path: displayPath,
          text: fileContent,
          resolvedPath: import_node_path2.default.resolve(resolvedPath)
        });
        if (verbose) {
          const label = isInput ? "[File]" : "[Expected Output File]";
          console.log(` ${label} Found: ${displayPath}`);
          console.log(` Resolved to: ${resolvedPath}`);
        }
      } catch (error) {
        const context = isInput ? "" : " expected output";
        // A thrown value is not guaranteed to be an Error; avoid logging
        // "undefined" for strings or plain objects.
        const reason = error instanceof Error ? error.message : String(error);
        logWarning(`Could not read${context} file ${resolvedPath}: ${reason}`);
      }
    }
  }
  return segments;
}
|
|
295
383
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
296
384
|
const verbose = options?.verbose ?? false;
|
|
297
385
|
const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
|
|
@@ -308,6 +396,9 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
308
396
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
309
397
|
}
|
|
310
398
|
const suite = parsed;
|
|
399
|
+
const datasetNameFromSuite = asString(suite.dataset)?.trim();
|
|
400
|
+
const fallbackDataset = import_node_path2.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
401
|
+
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
311
402
|
const schema = suite.$schema;
|
|
312
403
|
if (schema !== SCHEMA_EVAL_V2) {
|
|
313
404
|
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
@@ -374,77 +465,34 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
374
465
|
}
|
|
375
466
|
}
|
|
376
467
|
}
|
|
377
|
-
const userSegments = [];
|
|
378
468
|
const guidelinePaths = [];
|
|
379
|
-
const
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
searchRoots
|
|
400
|
-
);
|
|
401
|
-
if (!resolvedPath) {
|
|
402
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
403
|
-
logWarning(`File not found: ${displayPath}`, attempts);
|
|
404
|
-
continue;
|
|
405
|
-
}
|
|
406
|
-
try {
|
|
407
|
-
const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
408
|
-
const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
|
|
409
|
-
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
410
|
-
guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
|
|
411
|
-
if (verbose) {
|
|
412
|
-
console.log(` [Guideline] Found: ${displayPath}`);
|
|
413
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
414
|
-
}
|
|
415
|
-
} else {
|
|
416
|
-
userSegments.push({
|
|
417
|
-
type: "file",
|
|
418
|
-
path: displayPath,
|
|
419
|
-
text: fileContent,
|
|
420
|
-
resolvedPath: import_node_path2.default.resolve(resolvedPath)
|
|
421
|
-
});
|
|
422
|
-
if (verbose) {
|
|
423
|
-
console.log(` [File] Found: ${displayPath}`);
|
|
424
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
425
|
-
}
|
|
426
|
-
}
|
|
427
|
-
} catch (error) {
|
|
428
|
-
logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
429
|
-
}
|
|
430
|
-
continue;
|
|
431
|
-
}
|
|
432
|
-
const clonedSegment = cloneJsonObject(rawSegment);
|
|
433
|
-
userSegments.push(clonedSegment);
|
|
434
|
-
const inlineValue = clonedSegment.value;
|
|
435
|
-
if (typeof inlineValue === "string") {
|
|
436
|
-
userTextParts.push(inlineValue);
|
|
437
|
-
}
|
|
438
|
-
}
|
|
439
|
-
}
|
|
440
|
-
const codeSnippets = extractCodeBlocks(userSegments);
|
|
469
|
+
const inputTextParts = [];
|
|
470
|
+
const inputSegments = await processMessages({
|
|
471
|
+
messages: userMessages,
|
|
472
|
+
searchRoots,
|
|
473
|
+
repoRootPath,
|
|
474
|
+
guidelinePatterns,
|
|
475
|
+
guidelinePaths,
|
|
476
|
+
textParts: inputTextParts,
|
|
477
|
+
messageType: "input",
|
|
478
|
+
verbose
|
|
479
|
+
});
|
|
480
|
+
const outputSegments = await processMessages({
|
|
481
|
+
messages: assistantMessages,
|
|
482
|
+
searchRoots,
|
|
483
|
+
repoRootPath,
|
|
484
|
+
guidelinePatterns,
|
|
485
|
+
messageType: "output",
|
|
486
|
+
verbose
|
|
487
|
+
});
|
|
488
|
+
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
441
489
|
const assistantContent = assistantMessages[0]?.content;
|
|
442
|
-
const
|
|
443
|
-
const
|
|
490
|
+
const referenceAnswer = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
491
|
+
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
444
492
|
const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
445
493
|
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
446
494
|
const userFilePaths = [];
|
|
447
|
-
for (const segment of
|
|
495
|
+
for (const segment of inputSegments) {
|
|
448
496
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
449
497
|
userFilePaths.push(segment.resolvedPath);
|
|
450
498
|
}
|
|
@@ -455,16 +503,18 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
455
503
|
];
|
|
456
504
|
const testCase = {
|
|
457
505
|
id,
|
|
506
|
+
dataset: datasetName,
|
|
458
507
|
conversation_id: conversationId,
|
|
459
|
-
|
|
460
|
-
|
|
508
|
+
question,
|
|
509
|
+
input_segments: inputSegments,
|
|
510
|
+
output_segments: outputSegments,
|
|
461
511
|
system_message: systemMessageContent,
|
|
462
|
-
|
|
512
|
+
reference_answer: referenceAnswer,
|
|
463
513
|
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
|
|
464
514
|
guideline_patterns: guidelinePatterns,
|
|
465
515
|
file_paths: allFilePaths,
|
|
466
516
|
code_snippets: codeSnippets,
|
|
467
|
-
outcome,
|
|
517
|
+
expected_outcome: outcome,
|
|
468
518
|
evaluator: testCaseEvaluatorKind,
|
|
469
519
|
evaluators
|
|
470
520
|
};
|
|
@@ -500,36 +550,36 @@ ${content}`);
|
|
|
500
550
|
logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
501
551
|
}
|
|
502
552
|
}
|
|
503
|
-
const
|
|
504
|
-
for (const segment of testCase.
|
|
553
|
+
const questionParts = [];
|
|
554
|
+
for (const segment of testCase.input_segments) {
|
|
505
555
|
const typeValue = segment.type;
|
|
506
556
|
if (typeof typeValue === "string" && typeValue === "file") {
|
|
507
557
|
const pathValue = segment.path;
|
|
508
558
|
const textValue = segment.text;
|
|
509
559
|
const label = typeof pathValue === "string" ? pathValue : "file";
|
|
510
560
|
const body = typeof textValue === "string" ? textValue : "";
|
|
511
|
-
|
|
561
|
+
questionParts.push(`=== ${label} ===
|
|
512
562
|
${body}`);
|
|
513
563
|
continue;
|
|
514
564
|
}
|
|
515
565
|
if (typeof typeValue === "string" && typeValue === "text") {
|
|
516
566
|
const value = segment.value;
|
|
517
567
|
if (typeof value === "string") {
|
|
518
|
-
|
|
568
|
+
questionParts.push(value);
|
|
519
569
|
}
|
|
520
570
|
continue;
|
|
521
571
|
}
|
|
522
572
|
const genericValue = segment.value;
|
|
523
573
|
if (typeof genericValue === "string") {
|
|
524
|
-
|
|
574
|
+
questionParts.push(genericValue);
|
|
525
575
|
}
|
|
526
576
|
}
|
|
527
577
|
if (testCase.code_snippets.length > 0) {
|
|
528
|
-
|
|
578
|
+
questionParts.push(testCase.code_snippets.join("\n"));
|
|
529
579
|
}
|
|
530
|
-
const
|
|
580
|
+
const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
531
581
|
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
532
|
-
return {
|
|
582
|
+
return { question, guidelines, systemMessage: testCase.system_message };
|
|
533
583
|
}
|
|
534
584
|
async function fileExists2(absolutePath) {
|
|
535
585
|
try {
|
|
@@ -741,7 +791,7 @@ function buildChatPrompt(request) {
|
|
|
741
791
|
${request.guidelines.trim()}`);
|
|
742
792
|
}
|
|
743
793
|
const systemContent = systemSegments.join("\n\n");
|
|
744
|
-
const userContent = request.
|
|
794
|
+
const userContent = request.question.trim();
|
|
745
795
|
const prompt = [
|
|
746
796
|
{
|
|
747
797
|
role: "system",
|
|
@@ -835,6 +885,9 @@ var AzureProvider = class {
|
|
|
835
885
|
);
|
|
836
886
|
return mapResponse(ensureChatResponse(response));
|
|
837
887
|
}
|
|
888
|
+
getAxAI() {
|
|
889
|
+
return this.ai;
|
|
890
|
+
}
|
|
838
891
|
};
|
|
839
892
|
var AnthropicProvider = class {
|
|
840
893
|
constructor(targetName, config) {
|
|
@@ -869,6 +922,9 @@ var AnthropicProvider = class {
|
|
|
869
922
|
);
|
|
870
923
|
return mapResponse(ensureChatResponse(response));
|
|
871
924
|
}
|
|
925
|
+
getAxAI() {
|
|
926
|
+
return this.ai;
|
|
927
|
+
}
|
|
872
928
|
};
|
|
873
929
|
var GeminiProvider = class {
|
|
874
930
|
constructor(targetName, config) {
|
|
@@ -902,6 +958,9 @@ var GeminiProvider = class {
|
|
|
902
958
|
);
|
|
903
959
|
return mapResponse(ensureChatResponse(response));
|
|
904
960
|
}
|
|
961
|
+
getAxAI() {
|
|
962
|
+
return this.ai;
|
|
963
|
+
}
|
|
905
964
|
};
|
|
906
965
|
|
|
907
966
|
// src/evaluation/providers/cli.ts
|
|
@@ -1030,7 +1089,7 @@ var CliProvider = class {
|
|
|
1030
1089
|
healthcheck.commandTemplate,
|
|
1031
1090
|
buildTemplateValues(
|
|
1032
1091
|
{
|
|
1033
|
-
|
|
1092
|
+
question: "",
|
|
1034
1093
|
guidelines: "",
|
|
1035
1094
|
inputFiles: [],
|
|
1036
1095
|
evalCaseId: "",
|
|
@@ -1057,7 +1116,7 @@ var CliProvider = class {
|
|
|
1057
1116
|
function buildTemplateValues(request, config) {
|
|
1058
1117
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
1059
1118
|
return {
|
|
1060
|
-
PROMPT: shellEscape(request.
|
|
1119
|
+
PROMPT: shellEscape(request.question ?? ""),
|
|
1061
1120
|
GUIDELINES: shellEscape(request.guidelines ?? ""),
|
|
1062
1121
|
EVAL_ID: shellEscape(request.evalCaseId ?? ""),
|
|
1063
1122
|
ATTEMPT: shellEscape(String(request.attempt ?? 0)),
|
|
@@ -1121,6 +1180,59 @@ var import_node_os = require("os");
|
|
|
1121
1180
|
var import_node_path5 = __toESM(require("path"), 1);
|
|
1122
1181
|
var import_node_util2 = require("util");
|
|
1123
1182
|
|
|
1183
|
+
// src/evaluation/providers/codex-log-tracker.ts
|
|
1184
|
+
var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
|
|
1185
|
+
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
|
|
1186
|
+
/**
 * Returns the array of recorded Codex log entries kept on `globalThis`
 * under `GLOBAL_LOGS_KEY`, creating it on first access.
 *
 * @returns {Array<object>} The shared, mutable log-entry array.
 */
function getCodexLogStore() {
  const registry = globalThis;
  if (!registry[GLOBAL_LOGS_KEY]) {
    registry[GLOBAL_LOGS_KEY] = [];
  }
  return registry[GLOBAL_LOGS_KEY];
}
|
|
1196
|
+
/**
 * Returns the set of Codex log listeners kept on `globalThis` under
 * `GLOBAL_SUBSCRIBERS_KEY`, creating it on first access.
 *
 * @returns {Set<Function>} The shared, mutable listener set.
 */
function getSubscriberStore() {
  const registry = globalThis;
  if (!registry[GLOBAL_SUBSCRIBERS_KEY]) {
    registry[GLOBAL_SUBSCRIBERS_KEY] = new Set();
  }
  return registry[GLOBAL_SUBSCRIBERS_KEY];
}
|
|
1206
|
+
/**
 * Delivers a log entry to every registered listener.
 *
 * A listener that throws is reported with `console.warn` and does not stop
 * delivery to the remaining listeners.
 *
 * @param {object} entry - The log entry to broadcast.
 */
function notifySubscribers(entry) {
  // Work on a snapshot so listeners may add or remove subscriptions while
  // they are being notified.
  const snapshot = [...getSubscriberStore()];
  snapshot.forEach((listener) => {
    try {
      listener(entry);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Codex log subscriber failed: ${message}`);
    }
  });
}
|
|
1217
|
+
/**
 * Appends a log entry to the shared store, then notifies listeners.
 *
 * @param {object} entry - The log entry to record.
 */
function recordCodexLogEntry(entry) {
  const store = getCodexLogStore();
  store.push(entry);
  // Notify after storing so listeners observe the entry already recorded.
  notifySubscribers(entry);
}
|
|
1221
|
+
/**
 * Removes and returns every log entry currently in the shared store.
 *
 * @returns {Array<object>} The drained entries in insertion order; a new empty
 *   array when nothing was recorded.
 */
function consumeCodexLogEntries() {
  const pending = getCodexLogStore();
  return pending.length > 0 ? pending.splice(0, pending.length) : [];
}
|
|
1228
|
+
/**
 * Registers a listener for Codex log entries.
 *
 * @param {Function} listener - Added to the shared listener set.
 * @returns {Function} Call it to remove the listener from that set again.
 */
function subscribeToCodexLogEntries(listener) {
  const subscribers = getSubscriberStore();
  subscribers.add(listener);
  return () => void subscribers.delete(listener);
}
|
|
1235
|
+
|
|
1124
1236
|
// src/evaluation/providers/preread.ts
|
|
1125
1237
|
var import_node_path4 = __toESM(require("path"), 1);
|
|
1126
1238
|
function buildPromptDocument(request, inputFiles, options) {
|
|
@@ -1138,7 +1250,7 @@ function buildPromptDocument(request, inputFiles, options) {
|
|
|
1138
1250
|
if (prereadBlock.length > 0) {
|
|
1139
1251
|
parts.push("\n", prereadBlock);
|
|
1140
1252
|
}
|
|
1141
|
-
parts.push("\n[[ ## user_query ## ]]\n", request.
|
|
1253
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
1142
1254
|
return parts.join("\n").trim();
|
|
1143
1255
|
}
|
|
1144
1256
|
function normalizeInputFiles2(inputFiles) {
|
|
@@ -1418,7 +1530,12 @@ var CodexProvider = class {
|
|
|
1418
1530
|
attempt: request.attempt,
|
|
1419
1531
|
format: this.config.logFormat ?? "summary"
|
|
1420
1532
|
});
|
|
1421
|
-
|
|
1533
|
+
recordCodexLogEntry({
|
|
1534
|
+
filePath,
|
|
1535
|
+
targetName: this.targetName,
|
|
1536
|
+
evalCaseId: request.evalCaseId,
|
|
1537
|
+
attempt: request.attempt
|
|
1538
|
+
});
|
|
1422
1539
|
return logger;
|
|
1423
1540
|
} catch (error) {
|
|
1424
1541
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -1950,7 +2067,7 @@ var MockProvider = class {
|
|
|
1950
2067
|
return {
|
|
1951
2068
|
text: this.cannedResponse,
|
|
1952
2069
|
raw: {
|
|
1953
|
-
|
|
2070
|
+
question: request.question,
|
|
1954
2071
|
guidelines: request.guidelines
|
|
1955
2072
|
}
|
|
1956
2073
|
};
|
|
@@ -2566,7 +2683,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
|
2566
2683
|
if (prereadBlock.length > 0) {
|
|
2567
2684
|
parts.push("\n", prereadBlock);
|
|
2568
2685
|
}
|
|
2569
|
-
parts.push("\n[[ ## user_query ## ]]\n", request.
|
|
2686
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
2570
2687
|
return parts.join("\n").trim();
|
|
2571
2688
|
}
|
|
2572
2689
|
function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
@@ -2826,14 +2943,29 @@ var LlmJudgeEvaluator = class {
|
|
|
2826
2943
|
if (!judgeProvider) {
|
|
2827
2944
|
throw new Error("No judge provider available for LLM grading");
|
|
2828
2945
|
}
|
|
2829
|
-
|
|
2830
|
-
|
|
2946
|
+
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2947
|
+
}
|
|
2948
|
+
async evaluateWithPrompt(context, judgeProvider) {
|
|
2949
|
+
let prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
2950
|
+
let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
|
|
2951
|
+
if (systemPrompt && hasTemplateVariables(systemPrompt)) {
|
|
2952
|
+
const variables = {
|
|
2953
|
+
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
2954
|
+
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
2955
|
+
candidate_answer: context.candidate,
|
|
2956
|
+
reference_answer: context.evalCase.reference_answer,
|
|
2957
|
+
expected_outcome: context.evalCase.expected_outcome,
|
|
2958
|
+
question: context.evalCase.question
|
|
2959
|
+
};
|
|
2960
|
+
prompt = substituteVariables(systemPrompt, variables);
|
|
2961
|
+
systemPrompt = QUALITY_SYSTEM_PROMPT;
|
|
2962
|
+
}
|
|
2831
2963
|
const metadata = {
|
|
2832
2964
|
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2833
2965
|
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
2834
2966
|
};
|
|
2835
2967
|
const response = await judgeProvider.invoke({
|
|
2836
|
-
prompt,
|
|
2968
|
+
question: prompt,
|
|
2837
2969
|
metadata,
|
|
2838
2970
|
evalCaseId: context.evalCase.id,
|
|
2839
2971
|
attempt: context.attempt,
|
|
@@ -2845,6 +2977,7 @@ var LlmJudgeEvaluator = class {
|
|
|
2845
2977
|
const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
2846
2978
|
const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
2847
2979
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
2980
|
+
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
2848
2981
|
const evaluatorRawRequest = {
|
|
2849
2982
|
id: (0, import_node_crypto2.randomUUID)(),
|
|
2850
2983
|
provider: judgeProvider.id,
|
|
@@ -2857,16 +2990,16 @@ var LlmJudgeEvaluator = class {
|
|
|
2857
2990
|
score,
|
|
2858
2991
|
hits,
|
|
2859
2992
|
misses,
|
|
2860
|
-
expectedAspectCount
|
|
2993
|
+
expectedAspectCount,
|
|
2861
2994
|
reasoning,
|
|
2862
2995
|
evaluatorRawRequest
|
|
2863
2996
|
};
|
|
2864
2997
|
}
|
|
2865
2998
|
};
|
|
2866
2999
|
var QUALITY_SYSTEM_PROMPT = [
|
|
2867
|
-
"You are an expert evaluator. Your goal is to grade the
|
|
3000
|
+
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
2868
3001
|
"",
|
|
2869
|
-
"Use the reference_answer as a gold standard for a high-quality response. The
|
|
3002
|
+
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
|
|
2870
3003
|
"",
|
|
2871
3004
|
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
2872
3005
|
"",
|
|
@@ -2879,18 +3012,18 @@ var QUALITY_SYSTEM_PROMPT = [
|
|
|
2879
3012
|
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
2880
3013
|
"}"
|
|
2881
3014
|
].join("\n");
|
|
2882
|
-
function buildQualityPrompt(
|
|
3015
|
+
function buildQualityPrompt(evalCase, candidate) {
|
|
2883
3016
|
const parts = [
|
|
2884
3017
|
"[[ ## expected_outcome ## ]]",
|
|
2885
|
-
|
|
3018
|
+
evalCase.expected_outcome.trim(),
|
|
2886
3019
|
"",
|
|
2887
|
-
"[[ ##
|
|
2888
|
-
|
|
3020
|
+
"[[ ## question ## ]]",
|
|
3021
|
+
evalCase.question.trim(),
|
|
2889
3022
|
"",
|
|
2890
3023
|
"[[ ## reference_answer ## ]]",
|
|
2891
|
-
|
|
3024
|
+
evalCase.reference_answer.trim(),
|
|
2892
3025
|
"",
|
|
2893
|
-
"[[ ##
|
|
3026
|
+
"[[ ## candidate_answer ## ]]",
|
|
2894
3027
|
candidate.trim(),
|
|
2895
3028
|
"",
|
|
2896
3029
|
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
@@ -2990,14 +3123,14 @@ var CodeEvaluator = class {
|
|
|
2990
3123
|
async evaluate(context) {
|
|
2991
3124
|
const inputPayload = JSON.stringify(
|
|
2992
3125
|
{
|
|
2993
|
-
|
|
2994
|
-
|
|
2995
|
-
|
|
2996
|
-
|
|
3126
|
+
question: context.evalCase.question,
|
|
3127
|
+
expected_outcome: context.evalCase.expected_outcome,
|
|
3128
|
+
reference_answer: context.evalCase.reference_answer,
|
|
3129
|
+
candidate_answer: context.candidate,
|
|
2997
3130
|
system_message: context.promptInputs.systemMessage ?? "",
|
|
2998
3131
|
guideline_paths: context.evalCase.guideline_paths,
|
|
2999
|
-
|
|
3000
|
-
|
|
3132
|
+
input_files: context.evalCase.file_paths,
|
|
3133
|
+
input_segments: context.evalCase.input_segments
|
|
3001
3134
|
},
|
|
3002
3135
|
null,
|
|
3003
3136
|
2
|
|
@@ -3083,6 +3216,14 @@ function parseJsonSafe(payload) {
|
|
|
3083
3216
|
return void 0;
|
|
3084
3217
|
}
|
|
3085
3218
|
}
|
|
3219
|
+
/**
 * Reports whether the text contains at least one `${name}` placeholder, where
 * `name` consists of ASCII letters, digits, or underscores.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} True when a placeholder is present.
 */
function hasTemplateVariables(text) {
  const placeholderPattern = /\$\{[a-zA-Z0-9_]+\}/;
  return placeholderPattern.test(text);
}
|
|
3222
|
+
/**
 * Replaces `${name}` placeholders in a template with values from `variables`.
 *
 * A placeholder is left untouched when `variables` has no own property of
 * that name, or when the value is `null`/`undefined`.
 *
 * @param {string} template - Text containing `${name}` placeholders.
 * @param {Record<string, unknown>} variables - Replacement values keyed by placeholder name.
 * @returns {string} The template with known placeholders substituted.
 */
function substituteVariables(template, variables) {
  return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
    // Only own properties count: a plain lookup would resolve names such as
    // "constructor" or "toString" through Object.prototype and inject
    // stringified built-ins into the prompt.
    if (!Object.prototype.hasOwnProperty.call(variables, varName)) {
      return match;
    }
    return variables[varName] ?? match;
  });
}
|
|
3086
3227
|
|
|
3087
3228
|
// src/evaluation/orchestrator.ts
|
|
3088
3229
|
var import_node_crypto3 = require("crypto");
|
|
@@ -3439,7 +3580,7 @@ async function runBatchEvaluation(options) {
|
|
|
3439
3580
|
const batchRequests = evalCases.map((evalCase, index) => {
|
|
3440
3581
|
const promptInputs = promptInputsList[index];
|
|
3441
3582
|
return {
|
|
3442
|
-
|
|
3583
|
+
question: promptInputs.question,
|
|
3443
3584
|
guidelines: promptInputs.guidelines,
|
|
3444
3585
|
guideline_patterns: evalCase.guideline_patterns,
|
|
3445
3586
|
inputFiles: evalCase.file_paths,
|
|
@@ -3626,18 +3767,19 @@ async function evaluateCandidate(options) {
|
|
|
3626
3767
|
});
|
|
3627
3768
|
const completedAt = nowFn();
|
|
3628
3769
|
const rawRequest = {
|
|
3629
|
-
|
|
3770
|
+
question: promptInputs.question,
|
|
3630
3771
|
guidelines: promptInputs.guidelines,
|
|
3631
3772
|
guideline_paths: evalCase.guideline_paths,
|
|
3632
3773
|
system_message: promptInputs.systemMessage ?? ""
|
|
3633
3774
|
};
|
|
3634
3775
|
return {
|
|
3635
3776
|
eval_id: evalCase.id,
|
|
3777
|
+
dataset: evalCase.dataset,
|
|
3636
3778
|
conversation_id: evalCase.conversation_id,
|
|
3637
3779
|
score: score.score,
|
|
3638
3780
|
hits: score.hits,
|
|
3639
3781
|
misses: score.misses,
|
|
3640
|
-
|
|
3782
|
+
candidate_answer: candidate,
|
|
3641
3783
|
expected_aspect_count: score.expectedAspectCount,
|
|
3642
3784
|
target: target.name,
|
|
3643
3785
|
timestamp: completedAt.toISOString(),
|
|
@@ -3809,7 +3951,7 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
3809
3951
|
async function resolveCustomPrompt(config) {
|
|
3810
3952
|
if (config.promptPath) {
|
|
3811
3953
|
try {
|
|
3812
|
-
return await (
|
|
3954
|
+
return await readTextFile(config.promptPath);
|
|
3813
3955
|
} catch (error) {
|
|
3814
3956
|
const message = error instanceof Error ? error.message : String(error);
|
|
3815
3957
|
console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
|
|
@@ -3847,7 +3989,7 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
|
3847
3989
|
await (0, import_promises6.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
|
|
3848
3990
|
const payload = {
|
|
3849
3991
|
eval_id: evalCase.id,
|
|
3850
|
-
|
|
3992
|
+
question: promptInputs.question,
|
|
3851
3993
|
guidelines: promptInputs.guidelines,
|
|
3852
3994
|
guideline_paths: evalCase.guideline_paths
|
|
3853
3995
|
};
|
|
@@ -3869,7 +4011,7 @@ async function invokeProvider(provider, options) {
|
|
|
3869
4011
|
}
|
|
3870
4012
|
try {
|
|
3871
4013
|
return await provider.invoke({
|
|
3872
|
-
|
|
4014
|
+
question: promptInputs.question,
|
|
3873
4015
|
guidelines: promptInputs.guidelines,
|
|
3874
4016
|
guideline_patterns: evalCase.guideline_patterns,
|
|
3875
4017
|
inputFiles: evalCase.file_paths,
|
|
@@ -3889,7 +4031,7 @@ async function invokeProvider(provider, options) {
|
|
|
3889
4031
|
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
|
|
3890
4032
|
const message = error instanceof Error ? error.message : String(error);
|
|
3891
4033
|
const rawRequest = {
|
|
3892
|
-
|
|
4034
|
+
question: promptInputs.question,
|
|
3893
4035
|
guidelines: promptInputs.guidelines,
|
|
3894
4036
|
guideline_paths: evalCase.guideline_paths,
|
|
3895
4037
|
system_message: promptInputs.systemMessage ?? "",
|
|
@@ -3897,11 +4039,12 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
|
|
|
3897
4039
|
};
|
|
3898
4040
|
return {
|
|
3899
4041
|
eval_id: evalCase.id,
|
|
4042
|
+
dataset: evalCase.dataset,
|
|
3900
4043
|
conversation_id: evalCase.conversation_id,
|
|
3901
4044
|
score: 0,
|
|
3902
4045
|
hits: [],
|
|
3903
4046
|
misses: [`Error: ${message}`],
|
|
3904
|
-
|
|
4047
|
+
candidate_answer: `Error occurred: ${message}`,
|
|
3905
4048
|
expected_aspect_count: 0,
|
|
3906
4049
|
target: targetName,
|
|
3907
4050
|
timestamp: timestamp.toISOString(),
|
|
@@ -3914,7 +4057,7 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
|
3914
4057
|
hash.update(provider.id);
|
|
3915
4058
|
hash.update(target.name);
|
|
3916
4059
|
hash.update(evalCase.id);
|
|
3917
|
-
hash.update(promptInputs.
|
|
4060
|
+
hash.update(promptInputs.question);
|
|
3918
4061
|
hash.update(promptInputs.guidelines);
|
|
3919
4062
|
hash.update(promptInputs.systemMessage ?? "");
|
|
3920
4063
|
return hash.digest("hex");
|
|
@@ -3947,6 +4090,7 @@ function createAgentKernel() {
|
|
|
3947
4090
|
buildDirectoryChain,
|
|
3948
4091
|
buildPromptInputs,
|
|
3949
4092
|
buildSearchRoots,
|
|
4093
|
+
consumeCodexLogEntries,
|
|
3950
4094
|
createAgentKernel,
|
|
3951
4095
|
createProvider,
|
|
3952
4096
|
ensureVSCodeSubagents,
|
|
@@ -3963,10 +4107,12 @@ function createAgentKernel() {
|
|
|
3963
4107
|
listTargetNames,
|
|
3964
4108
|
loadEvalCases,
|
|
3965
4109
|
readTargetDefinitions,
|
|
4110
|
+
readTextFile,
|
|
3966
4111
|
resolveAndCreateProvider,
|
|
3967
4112
|
resolveFileReference,
|
|
3968
4113
|
resolveTargetDefinition,
|
|
3969
4114
|
runEvalCase,
|
|
3970
|
-
runEvaluation
|
|
4115
|
+
runEvaluation,
|
|
4116
|
+
subscribeToCodexLogEntries
|
|
3971
4117
|
});
|
|
3972
4118
|
//# sourceMappingURL=index.cjs.map
|