@agentv/core 0.5.3 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-NL7K4CAK.js → chunk-L7I5UTJU.js} +7 -2
- package/dist/chunk-L7I5UTJU.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +260 -114
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +34 -10
- package/dist/index.d.ts +34 -10
- package/dist/index.js +255 -115
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-NL7K4CAK.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -4,8 +4,9 @@ import {
|
|
|
4
4
|
buildSearchRoots,
|
|
5
5
|
fileExists,
|
|
6
6
|
findGitRoot,
|
|
7
|
+
readTextFile,
|
|
7
8
|
resolveFileReference
|
|
8
|
-
} from "./chunk-
|
|
9
|
+
} from "./chunk-L7I5UTJU.js";
|
|
9
10
|
|
|
10
11
|
// src/evaluation/types.ts
|
|
11
12
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -133,6 +134,87 @@ function extractCodeBlocks(segments) {
|
|
|
133
134
|
}
|
|
134
135
|
return codeBlocks;
|
|
135
136
|
}
|
|
137
|
+
/**
 * Flattens the `content` of every message into one ordered list of segments.
 *
 * - String content becomes a `{ type: "text", value }` segment and is also
 *   appended to `textParts` when that array is supplied.
 * - Array content is walked item by item. Items that are not JSON objects are
 *   skipped. Items typed `"file"` are resolved with `resolveFileReference`
 *   against `searchRoots`, read as UTF-8 (CRLF normalised to LF) and emitted
 *   as `{ type: "file", path, text, resolvedPath }`. Every other item is
 *   cloned with `cloneJsonObject`, and a string `value` on it is appended to
 *   `textParts`.
 * - Content that is neither a string nor an array is ignored.
 *
 * For `messageType === "input"`, when both `guidelinePatterns` and
 * `guidelinePaths` are supplied, a file whose repo-relative path satisfies
 * `isGuidelineFile` is recorded in `guidelinePaths` and left out of the
 * returned segments.
 *
 * Files that cannot be resolved or read only produce a warning; they never
 * abort processing.
 *
 * @param {object} options
 * @param {Array<{content: unknown}>} options.messages Messages to process.
 * @param {unknown} options.searchRoots Passed through to `resolveFileReference`.
 * @param {string} options.repoRootPath Base for the repo-relative guideline check.
 * @param {unknown} [options.guidelinePatterns] Passed through to `isGuidelineFile`.
 * @param {string[]} [options.guidelinePaths] Receives absolute guideline paths (mutated).
 * @param {string[]} [options.textParts] Receives inline text values (mutated).
 * @param {string} options.messageType `"input"` enables guideline detection; any
 *   other value only changes the wording of warnings and verbose labels.
 * @param {boolean} [options.verbose] Log each resolved file to the console.
 * @returns {Promise<object[]>} Segments in message order.
 */
async function processMessages(options) {
  const {
    messages,
    searchRoots,
    repoRootPath,
    guidelinePatterns,
    guidelinePaths,
    textParts,
    messageType,
    verbose
  } = options;
  const isInput = messageType === "input";
  const segments = [];
  for (const message of messages) {
    const content = message.content;
    if (typeof content === "string") {
      segments.push({ type: "text", value: content });
      if (textParts) {
        textParts.push(content);
      }
      continue;
    }
    if (!Array.isArray(content)) {
      // A missing or scalar content value has no segments; iterating it
      // would throw a TypeError and abort the whole load.
      continue;
    }
    for (const rawSegment of content) {
      if (!isJsonObject(rawSegment)) {
        continue;
      }
      const segmentType = asString(rawSegment.type);
      if (segmentType === "file") {
        const rawValue = asString(rawSegment.value);
        if (!rawValue) {
          continue;
        }
        const { displayPath, resolvedPath, attempted } = await resolveFileReference(
          rawValue,
          searchRoots
        );
        if (!resolvedPath) {
          const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
          const context = isInput ? "" : " in expected_messages";
          logWarning(`File not found${context}: ${displayPath}`, attempts);
          continue;
        }
        try {
          const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
          if (isInput && guidelinePatterns && guidelinePaths) {
            const relativeToRepo = path.relative(repoRootPath, resolvedPath);
            if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
              // Guideline files are tracked by path only; they are not part
              // of the returned segments.
              guidelinePaths.push(path.resolve(resolvedPath));
              if (verbose) {
                console.log(` [Guideline] Found: ${displayPath}`);
                console.log(` Resolved to: ${resolvedPath}`);
              }
              continue;
            }
          }
          segments.push({
            type: "file",
            path: displayPath,
            text: fileContent,
            resolvedPath: path.resolve(resolvedPath)
          });
          if (verbose) {
            const label = isInput ? "[File]" : "[Expected Output File]";
            console.log(` ${label} Found: ${displayPath}`);
            console.log(` Resolved to: ${resolvedPath}`);
          }
        } catch (error) {
          const context = isInput ? "" : " expected output";
          // Thrown values are not guaranteed to be Error instances.
          const reason = error instanceof Error ? error.message : String(error);
          logWarning(`Could not read${context} file ${resolvedPath}: ${reason}`);
        }
        continue;
      }
      const clonedSegment = cloneJsonObject(rawSegment);
      segments.push(clonedSegment);
      const inlineValue = clonedSegment.value;
      if (typeof inlineValue === "string" && textParts) {
        textParts.push(inlineValue);
      }
    }
  }
  return segments;
}
|
|
136
218
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
137
219
|
const verbose = options?.verbose ?? false;
|
|
138
220
|
const absoluteTestPath = path.resolve(evalFilePath);
|
|
@@ -149,6 +231,9 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
149
231
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
150
232
|
}
|
|
151
233
|
const suite = parsed;
|
|
234
|
+
const datasetNameFromSuite = asString(suite.dataset)?.trim();
|
|
235
|
+
const fallbackDataset = path.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
236
|
+
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
152
237
|
const schema = suite.$schema;
|
|
153
238
|
if (schema !== SCHEMA_EVAL_V2) {
|
|
154
239
|
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
@@ -215,77 +300,34 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
215
300
|
}
|
|
216
301
|
}
|
|
217
302
|
}
|
|
218
|
-
const userSegments = [];
|
|
219
303
|
const guidelinePaths = [];
|
|
220
|
-
const
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
searchRoots
|
|
241
|
-
);
|
|
242
|
-
if (!resolvedPath) {
|
|
243
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
244
|
-
logWarning(`File not found: ${displayPath}`, attempts);
|
|
245
|
-
continue;
|
|
246
|
-
}
|
|
247
|
-
try {
|
|
248
|
-
const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
249
|
-
const relativeToRepo = path.relative(repoRootPath, resolvedPath);
|
|
250
|
-
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
251
|
-
guidelinePaths.push(path.resolve(resolvedPath));
|
|
252
|
-
if (verbose) {
|
|
253
|
-
console.log(` [Guideline] Found: ${displayPath}`);
|
|
254
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
255
|
-
}
|
|
256
|
-
} else {
|
|
257
|
-
userSegments.push({
|
|
258
|
-
type: "file",
|
|
259
|
-
path: displayPath,
|
|
260
|
-
text: fileContent,
|
|
261
|
-
resolvedPath: path.resolve(resolvedPath)
|
|
262
|
-
});
|
|
263
|
-
if (verbose) {
|
|
264
|
-
console.log(` [File] Found: ${displayPath}`);
|
|
265
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
266
|
-
}
|
|
267
|
-
}
|
|
268
|
-
} catch (error) {
|
|
269
|
-
logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
270
|
-
}
|
|
271
|
-
continue;
|
|
272
|
-
}
|
|
273
|
-
const clonedSegment = cloneJsonObject(rawSegment);
|
|
274
|
-
userSegments.push(clonedSegment);
|
|
275
|
-
const inlineValue = clonedSegment.value;
|
|
276
|
-
if (typeof inlineValue === "string") {
|
|
277
|
-
userTextParts.push(inlineValue);
|
|
278
|
-
}
|
|
279
|
-
}
|
|
280
|
-
}
|
|
281
|
-
const codeSnippets = extractCodeBlocks(userSegments);
|
|
304
|
+
const inputTextParts = [];
|
|
305
|
+
const inputSegments = await processMessages({
|
|
306
|
+
messages: userMessages,
|
|
307
|
+
searchRoots,
|
|
308
|
+
repoRootPath,
|
|
309
|
+
guidelinePatterns,
|
|
310
|
+
guidelinePaths,
|
|
311
|
+
textParts: inputTextParts,
|
|
312
|
+
messageType: "input",
|
|
313
|
+
verbose
|
|
314
|
+
});
|
|
315
|
+
const outputSegments = await processMessages({
|
|
316
|
+
messages: assistantMessages,
|
|
317
|
+
searchRoots,
|
|
318
|
+
repoRootPath,
|
|
319
|
+
guidelinePatterns,
|
|
320
|
+
messageType: "output",
|
|
321
|
+
verbose
|
|
322
|
+
});
|
|
323
|
+
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
282
324
|
const assistantContent = assistantMessages[0]?.content;
|
|
283
|
-
const
|
|
284
|
-
const
|
|
325
|
+
const referenceAnswer = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
326
|
+
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
285
327
|
const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
286
328
|
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
287
329
|
const userFilePaths = [];
|
|
288
|
-
for (const segment of
|
|
330
|
+
for (const segment of inputSegments) {
|
|
289
331
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
290
332
|
userFilePaths.push(segment.resolvedPath);
|
|
291
333
|
}
|
|
@@ -296,16 +338,18 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
296
338
|
];
|
|
297
339
|
const testCase = {
|
|
298
340
|
id,
|
|
341
|
+
dataset: datasetName,
|
|
299
342
|
conversation_id: conversationId,
|
|
300
|
-
|
|
301
|
-
|
|
343
|
+
question,
|
|
344
|
+
input_segments: inputSegments,
|
|
345
|
+
output_segments: outputSegments,
|
|
302
346
|
system_message: systemMessageContent,
|
|
303
|
-
|
|
347
|
+
reference_answer: referenceAnswer,
|
|
304
348
|
guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
|
|
305
349
|
guideline_patterns: guidelinePatterns,
|
|
306
350
|
file_paths: allFilePaths,
|
|
307
351
|
code_snippets: codeSnippets,
|
|
308
|
-
outcome,
|
|
352
|
+
expected_outcome: outcome,
|
|
309
353
|
evaluator: testCaseEvaluatorKind,
|
|
310
354
|
evaluators
|
|
311
355
|
};
|
|
@@ -341,36 +385,36 @@ ${content}`);
|
|
|
341
385
|
logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
342
386
|
}
|
|
343
387
|
}
|
|
344
|
-
const
|
|
345
|
-
for (const segment of testCase.
|
|
388
|
+
const questionParts = [];
|
|
389
|
+
for (const segment of testCase.input_segments) {
|
|
346
390
|
const typeValue = segment.type;
|
|
347
391
|
if (typeof typeValue === "string" && typeValue === "file") {
|
|
348
392
|
const pathValue = segment.path;
|
|
349
393
|
const textValue = segment.text;
|
|
350
394
|
const label = typeof pathValue === "string" ? pathValue : "file";
|
|
351
395
|
const body = typeof textValue === "string" ? textValue : "";
|
|
352
|
-
|
|
396
|
+
questionParts.push(`=== ${label} ===
|
|
353
397
|
${body}`);
|
|
354
398
|
continue;
|
|
355
399
|
}
|
|
356
400
|
if (typeof typeValue === "string" && typeValue === "text") {
|
|
357
401
|
const value = segment.value;
|
|
358
402
|
if (typeof value === "string") {
|
|
359
|
-
|
|
403
|
+
questionParts.push(value);
|
|
360
404
|
}
|
|
361
405
|
continue;
|
|
362
406
|
}
|
|
363
407
|
const genericValue = segment.value;
|
|
364
408
|
if (typeof genericValue === "string") {
|
|
365
|
-
|
|
409
|
+
questionParts.push(genericValue);
|
|
366
410
|
}
|
|
367
411
|
}
|
|
368
412
|
if (testCase.code_snippets.length > 0) {
|
|
369
|
-
|
|
413
|
+
questionParts.push(testCase.code_snippets.join("\n"));
|
|
370
414
|
}
|
|
371
|
-
const
|
|
415
|
+
const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
372
416
|
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
373
|
-
return {
|
|
417
|
+
return { question, guidelines, systemMessage: testCase.system_message };
|
|
374
418
|
}
|
|
375
419
|
async function fileExists2(absolutePath) {
|
|
376
420
|
try {
|
|
@@ -582,7 +626,7 @@ function buildChatPrompt(request) {
|
|
|
582
626
|
${request.guidelines.trim()}`);
|
|
583
627
|
}
|
|
584
628
|
const systemContent = systemSegments.join("\n\n");
|
|
585
|
-
const userContent = request.
|
|
629
|
+
const userContent = request.question.trim();
|
|
586
630
|
const prompt = [
|
|
587
631
|
{
|
|
588
632
|
role: "system",
|
|
@@ -676,6 +720,9 @@ var AzureProvider = class {
|
|
|
676
720
|
);
|
|
677
721
|
return mapResponse(ensureChatResponse(response));
|
|
678
722
|
}
|
|
723
|
+
// Accessor on AzureProvider: returns whatever is stored on `this.ai`.
// NOTE(review): presumably the underlying Ax AI client set up in the constructor — confirm.
getAxAI() {
|
|
724
|
+
return this.ai;
|
|
725
|
+
}
|
|
679
726
|
};
|
|
680
727
|
var AnthropicProvider = class {
|
|
681
728
|
constructor(targetName, config) {
|
|
@@ -710,6 +757,9 @@ var AnthropicProvider = class {
|
|
|
710
757
|
);
|
|
711
758
|
return mapResponse(ensureChatResponse(response));
|
|
712
759
|
}
|
|
760
|
+
// Accessor on AnthropicProvider: returns whatever is stored on `this.ai`.
// NOTE(review): presumably the underlying Ax AI client set up in the constructor — confirm.
getAxAI() {
|
|
761
|
+
return this.ai;
|
|
762
|
+
}
|
|
713
763
|
};
|
|
714
764
|
var GeminiProvider = class {
|
|
715
765
|
constructor(targetName, config) {
|
|
@@ -743,6 +793,9 @@ var GeminiProvider = class {
|
|
|
743
793
|
);
|
|
744
794
|
return mapResponse(ensureChatResponse(response));
|
|
745
795
|
}
|
|
796
|
+
// Accessor on GeminiProvider: returns whatever is stored on `this.ai`.
// NOTE(review): presumably the underlying Ax AI client set up in the constructor — confirm.
getAxAI() {
|
|
797
|
+
return this.ai;
|
|
798
|
+
}
|
|
746
799
|
};
|
|
747
800
|
|
|
748
801
|
// src/evaluation/providers/cli.ts
|
|
@@ -871,7 +924,7 @@ var CliProvider = class {
|
|
|
871
924
|
healthcheck.commandTemplate,
|
|
872
925
|
buildTemplateValues(
|
|
873
926
|
{
|
|
874
|
-
|
|
927
|
+
question: "",
|
|
875
928
|
guidelines: "",
|
|
876
929
|
inputFiles: [],
|
|
877
930
|
evalCaseId: "",
|
|
@@ -898,7 +951,7 @@ var CliProvider = class {
|
|
|
898
951
|
function buildTemplateValues(request, config) {
|
|
899
952
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
900
953
|
return {
|
|
901
|
-
PROMPT: shellEscape(request.
|
|
954
|
+
PROMPT: shellEscape(request.question ?? ""),
|
|
902
955
|
GUIDELINES: shellEscape(request.guidelines ?? ""),
|
|
903
956
|
EVAL_ID: shellEscape(request.evalCaseId ?? ""),
|
|
904
957
|
ATTEMPT: shellEscape(String(request.attempt ?? 0)),
|
|
@@ -962,6 +1015,59 @@ import { tmpdir } from "node:os";
|
|
|
962
1015
|
import path4 from "node:path";
|
|
963
1016
|
import { promisify as promisify2 } from "node:util";
|
|
964
1017
|
|
|
1018
|
+
// src/evaluation/providers/codex-log-tracker.ts
|
|
1019
|
+
var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
|
|
1020
|
+
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
|
|
1021
|
+
/**
 * Returns the array stored on `globalThis` under `GLOBAL_LOGS_KEY`,
 * installing an empty array there first when the slot holds no truthy value.
 *
 * @returns {Array} The same array instance on every call until the slot is replaced.
 */
function getCodexLogStore() {
  const registry = globalThis;
  if (!registry[GLOBAL_LOGS_KEY]) {
    registry[GLOBAL_LOGS_KEY] = [];
  }
  return registry[GLOBAL_LOGS_KEY];
}
|
|
1031
|
+
/**
 * Returns the Set stored on `globalThis` under `GLOBAL_SUBSCRIBERS_KEY`,
 * installing an empty Set there first when the slot holds no truthy value.
 *
 * @returns {Set} The same Set instance on every call until the slot is replaced.
 */
function getSubscriberStore() {
  const registry = globalThis;
  if (!registry[GLOBAL_SUBSCRIBERS_KEY]) {
    registry[GLOBAL_SUBSCRIBERS_KEY] = /* @__PURE__ */ new Set();
  }
  return registry[GLOBAL_SUBSCRIBERS_KEY];
}
|
|
1041
|
+
/**
 * Calls every listener in the subscriber store with `entry`.
 *
 * The store is copied before iterating, so the set of listeners invoked is
 * fixed at the moment of the call. A listener that throws is reported with
 * `console.warn` and does not stop the remaining listeners.
 *
 * @param {*} entry Value handed to each listener.
 * @returns {void}
 */
function notifySubscribers(entry) {
  const snapshot = [...getSubscriberStore()];
  snapshot.forEach((listener) => {
    try {
      listener(entry);
    } catch (failure) {
      const reason = failure instanceof Error ? failure.message : String(failure);
      console.warn(`Codex log subscriber failed: ${reason}`);
    }
  });
}
|
|
1052
|
+
/**
 * Appends `entry` to the Codex log store, then passes it to `notifySubscribers`.
 *
 * @param {*} entry Log entry to record.
 * @returns {void}
 */
function recordCodexLogEntry(entry) {
  const logStore = getCodexLogStore();
  logStore.push(entry);
  notifySubscribers(entry);
}
|
|
1056
|
+
/**
 * Removes all entries from the Codex log store and returns them.
 *
 * @returns {Array} The drained entries in insertion order, or a new empty
 *   array when the store had none. The store itself is left empty.
 */
function consumeCodexLogEntries() {
  const pending = getCodexLogStore();
  return pending.length > 0 ? pending.splice(0, pending.length) : [];
}
|
|
1063
|
+
/**
 * Adds `listener` to the subscriber store.
 *
 * @param {Function} listener Callback to register.
 * @returns {() => void} Function that removes `listener` from the store it
 *   was added to.
 */
function subscribeToCodexLogEntries(listener) {
  const subscribers = getSubscriberStore();
  subscribers.add(listener);
  // The closure keeps the store captured at subscribe time.
  return () => {
    subscribers.delete(listener);
  };
}
|
|
1070
|
+
|
|
965
1071
|
// src/evaluation/providers/preread.ts
|
|
966
1072
|
import path3 from "node:path";
|
|
967
1073
|
function buildPromptDocument(request, inputFiles, options) {
|
|
@@ -979,7 +1085,7 @@ function buildPromptDocument(request, inputFiles, options) {
|
|
|
979
1085
|
if (prereadBlock.length > 0) {
|
|
980
1086
|
parts.push("\n", prereadBlock);
|
|
981
1087
|
}
|
|
982
|
-
parts.push("\n[[ ## user_query ## ]]\n", request.
|
|
1088
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
983
1089
|
return parts.join("\n").trim();
|
|
984
1090
|
}
|
|
985
1091
|
function normalizeInputFiles2(inputFiles) {
|
|
@@ -1259,7 +1365,12 @@ var CodexProvider = class {
|
|
|
1259
1365
|
attempt: request.attempt,
|
|
1260
1366
|
format: this.config.logFormat ?? "summary"
|
|
1261
1367
|
});
|
|
1262
|
-
|
|
1368
|
+
recordCodexLogEntry({
|
|
1369
|
+
filePath,
|
|
1370
|
+
targetName: this.targetName,
|
|
1371
|
+
evalCaseId: request.evalCaseId,
|
|
1372
|
+
attempt: request.attempt
|
|
1373
|
+
});
|
|
1263
1374
|
return logger;
|
|
1264
1375
|
} catch (error) {
|
|
1265
1376
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -1791,7 +1902,7 @@ var MockProvider = class {
|
|
|
1791
1902
|
return {
|
|
1792
1903
|
text: this.cannedResponse,
|
|
1793
1904
|
raw: {
|
|
1794
|
-
|
|
1905
|
+
question: request.question,
|
|
1795
1906
|
guidelines: request.guidelines
|
|
1796
1907
|
}
|
|
1797
1908
|
};
|
|
@@ -2407,7 +2518,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
|
2407
2518
|
if (prereadBlock.length > 0) {
|
|
2408
2519
|
parts.push("\n", prereadBlock);
|
|
2409
2520
|
}
|
|
2410
|
-
parts.push("\n[[ ## user_query ## ]]\n", request.
|
|
2521
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
2411
2522
|
return parts.join("\n").trim();
|
|
2412
2523
|
}
|
|
2413
2524
|
function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
@@ -2662,14 +2773,29 @@ var LlmJudgeEvaluator = class {
|
|
|
2662
2773
|
if (!judgeProvider) {
|
|
2663
2774
|
throw new Error("No judge provider available for LLM grading");
|
|
2664
2775
|
}
|
|
2665
|
-
|
|
2666
|
-
|
|
2776
|
+
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2777
|
+
}
|
|
2778
|
+
async evaluateWithPrompt(context, judgeProvider) {
|
|
2779
|
+
let prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
2780
|
+
let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
|
|
2781
|
+
if (systemPrompt && hasTemplateVariables(systemPrompt)) {
|
|
2782
|
+
const variables = {
|
|
2783
|
+
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
2784
|
+
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
2785
|
+
candidate_answer: context.candidate,
|
|
2786
|
+
reference_answer: context.evalCase.reference_answer,
|
|
2787
|
+
expected_outcome: context.evalCase.expected_outcome,
|
|
2788
|
+
question: context.evalCase.question
|
|
2789
|
+
};
|
|
2790
|
+
prompt = substituteVariables(systemPrompt, variables);
|
|
2791
|
+
systemPrompt = QUALITY_SYSTEM_PROMPT;
|
|
2792
|
+
}
|
|
2667
2793
|
const metadata = {
|
|
2668
2794
|
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2669
2795
|
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
2670
2796
|
};
|
|
2671
2797
|
const response = await judgeProvider.invoke({
|
|
2672
|
-
prompt,
|
|
2798
|
+
question: prompt,
|
|
2673
2799
|
metadata,
|
|
2674
2800
|
evalCaseId: context.evalCase.id,
|
|
2675
2801
|
attempt: context.attempt,
|
|
@@ -2681,6 +2807,7 @@ var LlmJudgeEvaluator = class {
|
|
|
2681
2807
|
const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
2682
2808
|
const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
2683
2809
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
2810
|
+
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
2684
2811
|
const evaluatorRawRequest = {
|
|
2685
2812
|
id: randomUUID2(),
|
|
2686
2813
|
provider: judgeProvider.id,
|
|
@@ -2693,16 +2820,16 @@ var LlmJudgeEvaluator = class {
|
|
|
2693
2820
|
score,
|
|
2694
2821
|
hits,
|
|
2695
2822
|
misses,
|
|
2696
|
-
expectedAspectCount
|
|
2823
|
+
expectedAspectCount,
|
|
2697
2824
|
reasoning,
|
|
2698
2825
|
evaluatorRawRequest
|
|
2699
2826
|
};
|
|
2700
2827
|
}
|
|
2701
2828
|
};
|
|
2702
2829
|
var QUALITY_SYSTEM_PROMPT = [
|
|
2703
|
-
"You are an expert evaluator. Your goal is to grade the
|
|
2830
|
+
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
2704
2831
|
"",
|
|
2705
|
-
"Use the reference_answer as a gold standard for a high-quality response. The
|
|
2832
|
+
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
|
|
2706
2833
|
"",
|
|
2707
2834
|
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
2708
2835
|
"",
|
|
@@ -2715,18 +2842,18 @@ var QUALITY_SYSTEM_PROMPT = [
|
|
|
2715
2842
|
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
2716
2843
|
"}"
|
|
2717
2844
|
].join("\n");
|
|
2718
|
-
function buildQualityPrompt(
|
|
2845
|
+
function buildQualityPrompt(evalCase, candidate) {
|
|
2719
2846
|
const parts = [
|
|
2720
2847
|
"[[ ## expected_outcome ## ]]",
|
|
2721
|
-
|
|
2848
|
+
evalCase.expected_outcome.trim(),
|
|
2722
2849
|
"",
|
|
2723
|
-
"[[ ##
|
|
2724
|
-
|
|
2850
|
+
"[[ ## question ## ]]",
|
|
2851
|
+
evalCase.question.trim(),
|
|
2725
2852
|
"",
|
|
2726
2853
|
"[[ ## reference_answer ## ]]",
|
|
2727
|
-
|
|
2854
|
+
evalCase.reference_answer.trim(),
|
|
2728
2855
|
"",
|
|
2729
|
-
"[[ ##
|
|
2856
|
+
"[[ ## candidate_answer ## ]]",
|
|
2730
2857
|
candidate.trim(),
|
|
2731
2858
|
"",
|
|
2732
2859
|
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
@@ -2826,14 +2953,14 @@ var CodeEvaluator = class {
|
|
|
2826
2953
|
async evaluate(context) {
|
|
2827
2954
|
const inputPayload = JSON.stringify(
|
|
2828
2955
|
{
|
|
2829
|
-
|
|
2830
|
-
|
|
2831
|
-
|
|
2832
|
-
|
|
2956
|
+
question: context.evalCase.question,
|
|
2957
|
+
expected_outcome: context.evalCase.expected_outcome,
|
|
2958
|
+
reference_answer: context.evalCase.reference_answer,
|
|
2959
|
+
candidate_answer: context.candidate,
|
|
2833
2960
|
system_message: context.promptInputs.systemMessage ?? "",
|
|
2834
2961
|
guideline_paths: context.evalCase.guideline_paths,
|
|
2835
|
-
|
|
2836
|
-
|
|
2962
|
+
input_files: context.evalCase.file_paths,
|
|
2963
|
+
input_segments: context.evalCase.input_segments
|
|
2837
2964
|
},
|
|
2838
2965
|
null,
|
|
2839
2966
|
2
|
|
@@ -2919,10 +3046,18 @@ function parseJsonSafe(payload) {
|
|
|
2919
3046
|
return void 0;
|
|
2920
3047
|
}
|
|
2921
3048
|
}
|
|
3049
|
+
/**
 * Reports whether `text` contains at least one `${name}` placeholder, where
 * `name` is one or more ASCII letters, digits or underscores.
 *
 * @param {string} text Text to inspect.
 * @returns {boolean} True when a placeholder is present.
 */
function hasTemplateVariables(text) {
  const placeholder = /\$\{[a-zA-Z0-9_]+\}/;
  return placeholder.exec(text) !== null;
}
|
|
3052
|
+
/**
 * Replaces each `${name}` placeholder in `template` with the matching value
 * from `variables`.
 *
 * A placeholder is left untouched when `variables` has no own property of
 * that name, or when the value is `null` or `undefined`. Only own properties
 * are consulted, so names such as `constructor` or `toString` are never
 * resolved from the prototype chain.
 *
 * @param {string} template Text containing `${name}` placeholders.
 * @param {Record<string, unknown>} variables Substitution values by name.
 * @returns {string} The template with known placeholders substituted.
 */
function substituteVariables(template, variables) {
  return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
    if (!Object.prototype.hasOwnProperty.call(variables, varName)) {
      return match;
    }
    const value = variables[varName];
    return value === null || value === undefined ? match : String(value);
  });
}
|
|
2922
3057
|
|
|
2923
3058
|
// src/evaluation/orchestrator.ts
|
|
2924
3059
|
import { createHash, randomUUID as randomUUID3 } from "node:crypto";
|
|
2925
|
-
import { mkdir as mkdir2,
|
|
3060
|
+
import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
|
|
2926
3061
|
import path7 from "node:path";
|
|
2927
3062
|
|
|
2928
3063
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
@@ -3275,7 +3410,7 @@ async function runBatchEvaluation(options) {
|
|
|
3275
3410
|
const batchRequests = evalCases.map((evalCase, index) => {
|
|
3276
3411
|
const promptInputs = promptInputsList[index];
|
|
3277
3412
|
return {
|
|
3278
|
-
|
|
3413
|
+
question: promptInputs.question,
|
|
3279
3414
|
guidelines: promptInputs.guidelines,
|
|
3280
3415
|
guideline_patterns: evalCase.guideline_patterns,
|
|
3281
3416
|
inputFiles: evalCase.file_paths,
|
|
@@ -3462,18 +3597,19 @@ async function evaluateCandidate(options) {
|
|
|
3462
3597
|
});
|
|
3463
3598
|
const completedAt = nowFn();
|
|
3464
3599
|
const rawRequest = {
|
|
3465
|
-
|
|
3600
|
+
question: promptInputs.question,
|
|
3466
3601
|
guidelines: promptInputs.guidelines,
|
|
3467
3602
|
guideline_paths: evalCase.guideline_paths,
|
|
3468
3603
|
system_message: promptInputs.systemMessage ?? ""
|
|
3469
3604
|
};
|
|
3470
3605
|
return {
|
|
3471
3606
|
eval_id: evalCase.id,
|
|
3607
|
+
dataset: evalCase.dataset,
|
|
3472
3608
|
conversation_id: evalCase.conversation_id,
|
|
3473
3609
|
score: score.score,
|
|
3474
3610
|
hits: score.hits,
|
|
3475
3611
|
misses: score.misses,
|
|
3476
|
-
|
|
3612
|
+
candidate_answer: candidate,
|
|
3477
3613
|
expected_aspect_count: score.expectedAspectCount,
|
|
3478
3614
|
target: target.name,
|
|
3479
3615
|
timestamp: completedAt.toISOString(),
|
|
@@ -3645,7 +3781,7 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
3645
3781
|
async function resolveCustomPrompt(config) {
|
|
3646
3782
|
if (config.promptPath) {
|
|
3647
3783
|
try {
|
|
3648
|
-
return await
|
|
3784
|
+
return await readTextFile(config.promptPath);
|
|
3649
3785
|
} catch (error) {
|
|
3650
3786
|
const message = error instanceof Error ? error.message : String(error);
|
|
3651
3787
|
console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
|
|
@@ -3683,7 +3819,7 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
|
3683
3819
|
await mkdir2(path7.dirname(filePath), { recursive: true });
|
|
3684
3820
|
const payload = {
|
|
3685
3821
|
eval_id: evalCase.id,
|
|
3686
|
-
|
|
3822
|
+
question: promptInputs.question,
|
|
3687
3823
|
guidelines: promptInputs.guidelines,
|
|
3688
3824
|
guideline_paths: evalCase.guideline_paths
|
|
3689
3825
|
};
|
|
@@ -3705,7 +3841,7 @@ async function invokeProvider(provider, options) {
|
|
|
3705
3841
|
}
|
|
3706
3842
|
try {
|
|
3707
3843
|
return await provider.invoke({
|
|
3708
|
-
|
|
3844
|
+
question: promptInputs.question,
|
|
3709
3845
|
guidelines: promptInputs.guidelines,
|
|
3710
3846
|
guideline_patterns: evalCase.guideline_patterns,
|
|
3711
3847
|
inputFiles: evalCase.file_paths,
|
|
@@ -3725,7 +3861,7 @@ async function invokeProvider(provider, options) {
|
|
|
3725
3861
|
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
|
|
3726
3862
|
const message = error instanceof Error ? error.message : String(error);
|
|
3727
3863
|
const rawRequest = {
|
|
3728
|
-
|
|
3864
|
+
question: promptInputs.question,
|
|
3729
3865
|
guidelines: promptInputs.guidelines,
|
|
3730
3866
|
guideline_paths: evalCase.guideline_paths,
|
|
3731
3867
|
system_message: promptInputs.systemMessage ?? "",
|
|
@@ -3733,11 +3869,12 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
|
|
|
3733
3869
|
};
|
|
3734
3870
|
return {
|
|
3735
3871
|
eval_id: evalCase.id,
|
|
3872
|
+
dataset: evalCase.dataset,
|
|
3736
3873
|
conversation_id: evalCase.conversation_id,
|
|
3737
3874
|
score: 0,
|
|
3738
3875
|
hits: [],
|
|
3739
3876
|
misses: [`Error: ${message}`],
|
|
3740
|
-
|
|
3877
|
+
candidate_answer: `Error occurred: ${message}`,
|
|
3741
3878
|
expected_aspect_count: 0,
|
|
3742
3879
|
target: targetName,
|
|
3743
3880
|
timestamp: timestamp.toISOString(),
|
|
@@ -3750,7 +3887,7 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
|
3750
3887
|
hash.update(provider.id);
|
|
3751
3888
|
hash.update(target.name);
|
|
3752
3889
|
hash.update(evalCase.id);
|
|
3753
|
-
hash.update(promptInputs.
|
|
3890
|
+
hash.update(promptInputs.question);
|
|
3754
3891
|
hash.update(promptInputs.guidelines);
|
|
3755
3892
|
hash.update(promptInputs.systemMessage ?? "");
|
|
3756
3893
|
return hash.digest("hex");
|
|
@@ -3782,6 +3919,7 @@ export {
|
|
|
3782
3919
|
buildDirectoryChain,
|
|
3783
3920
|
buildPromptInputs,
|
|
3784
3921
|
buildSearchRoots,
|
|
3922
|
+
consumeCodexLogEntries,
|
|
3785
3923
|
createAgentKernel,
|
|
3786
3924
|
createProvider,
|
|
3787
3925
|
ensureVSCodeSubagents,
|
|
@@ -3798,10 +3936,12 @@ export {
|
|
|
3798
3936
|
listTargetNames,
|
|
3799
3937
|
loadEvalCases,
|
|
3800
3938
|
readTargetDefinitions,
|
|
3939
|
+
readTextFile,
|
|
3801
3940
|
resolveAndCreateProvider,
|
|
3802
3941
|
resolveFileReference,
|
|
3803
3942
|
resolveTargetDefinition,
|
|
3804
3943
|
runEvalCase,
|
|
3805
|
-
runEvaluation
|
|
3944
|
+
runEvaluation,
|
|
3945
|
+
subscribeToCodexLogEntries
|
|
3806
3946
|
};
|
|
3807
3947
|
//# sourceMappingURL=index.js.map
|