@agentv/core 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-OW3SHBIJ.js → chunk-L7I5UTJU.js} +1 -1
- package/dist/{chunk-OW3SHBIJ.js.map → chunk-L7I5UTJU.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +221 -242
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -13
- package/dist/index.d.ts +11 -13
- package/dist/index.js +222 -243
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -6,7 +6,7 @@ import {
|
|
|
6
6
|
findGitRoot,
|
|
7
7
|
readTextFile,
|
|
8
8
|
resolveFileReference
|
|
9
|
-
} from "./chunk-
|
|
9
|
+
} from "./chunk-L7I5UTJU.js";
|
|
10
10
|
|
|
11
11
|
// src/evaluation/types.ts
|
|
12
12
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -134,6 +134,87 @@ function extractCodeBlocks(segments) {
|
|
|
134
134
|
}
|
|
135
135
|
return codeBlocks;
|
|
136
136
|
}
|
|
137
|
+
async function processMessages(options) {
|
|
138
|
+
const {
|
|
139
|
+
messages,
|
|
140
|
+
searchRoots,
|
|
141
|
+
repoRootPath,
|
|
142
|
+
guidelinePatterns,
|
|
143
|
+
guidelinePaths,
|
|
144
|
+
textParts,
|
|
145
|
+
messageType,
|
|
146
|
+
verbose
|
|
147
|
+
} = options;
|
|
148
|
+
const segments = [];
|
|
149
|
+
for (const message of messages) {
|
|
150
|
+
const content = message.content;
|
|
151
|
+
if (typeof content === "string") {
|
|
152
|
+
segments.push({ type: "text", value: content });
|
|
153
|
+
if (textParts) {
|
|
154
|
+
textParts.push(content);
|
|
155
|
+
}
|
|
156
|
+
continue;
|
|
157
|
+
}
|
|
158
|
+
for (const rawSegment of content) {
|
|
159
|
+
if (!isJsonObject(rawSegment)) {
|
|
160
|
+
continue;
|
|
161
|
+
}
|
|
162
|
+
const segmentType = asString(rawSegment.type);
|
|
163
|
+
if (segmentType === "file") {
|
|
164
|
+
const rawValue = asString(rawSegment.value);
|
|
165
|
+
if (!rawValue) {
|
|
166
|
+
continue;
|
|
167
|
+
}
|
|
168
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
169
|
+
rawValue,
|
|
170
|
+
searchRoots
|
|
171
|
+
);
|
|
172
|
+
if (!resolvedPath) {
|
|
173
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
174
|
+
const context = messageType === "input" ? "" : " in expected_messages";
|
|
175
|
+
logWarning(`File not found${context}: ${displayPath}`, attempts);
|
|
176
|
+
continue;
|
|
177
|
+
}
|
|
178
|
+
try {
|
|
179
|
+
const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
180
|
+
if (messageType === "input" && guidelinePatterns && guidelinePaths) {
|
|
181
|
+
const relativeToRepo = path.relative(repoRootPath, resolvedPath);
|
|
182
|
+
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
183
|
+
guidelinePaths.push(path.resolve(resolvedPath));
|
|
184
|
+
if (verbose) {
|
|
185
|
+
console.log(` [Guideline] Found: ${displayPath}`);
|
|
186
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
187
|
+
}
|
|
188
|
+
continue;
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
segments.push({
|
|
192
|
+
type: "file",
|
|
193
|
+
path: displayPath,
|
|
194
|
+
text: fileContent,
|
|
195
|
+
resolvedPath: path.resolve(resolvedPath)
|
|
196
|
+
});
|
|
197
|
+
if (verbose) {
|
|
198
|
+
const label = messageType === "input" ? "[File]" : "[Expected Output File]";
|
|
199
|
+
console.log(` ${label} Found: ${displayPath}`);
|
|
200
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
201
|
+
}
|
|
202
|
+
} catch (error) {
|
|
203
|
+
const context = messageType === "input" ? "" : " expected output";
|
|
204
|
+
logWarning(`Could not read${context} file ${resolvedPath}: ${error.message}`);
|
|
205
|
+
}
|
|
206
|
+
continue;
|
|
207
|
+
}
|
|
208
|
+
const clonedSegment = cloneJsonObject(rawSegment);
|
|
209
|
+
segments.push(clonedSegment);
|
|
210
|
+
const inlineValue = clonedSegment.value;
|
|
211
|
+
if (typeof inlineValue === "string" && textParts) {
|
|
212
|
+
textParts.push(inlineValue);
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
return segments;
|
|
217
|
+
}
|
|
137
218
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
138
219
|
const verbose = options?.verbose ?? false;
|
|
139
220
|
const absoluteTestPath = path.resolve(evalFilePath);
|
|
@@ -219,77 +300,34 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
219
300
|
}
|
|
220
301
|
}
|
|
221
302
|
}
|
|
222
|
-
const userSegments = [];
|
|
223
303
|
const guidelinePaths = [];
|
|
224
|
-
const
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
searchRoots
|
|
245
|
-
);
|
|
246
|
-
if (!resolvedPath) {
|
|
247
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
248
|
-
logWarning(`File not found: ${displayPath}`, attempts);
|
|
249
|
-
continue;
|
|
250
|
-
}
|
|
251
|
-
try {
|
|
252
|
-
const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
253
|
-
const relativeToRepo = path.relative(repoRootPath, resolvedPath);
|
|
254
|
-
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
255
|
-
guidelinePaths.push(path.resolve(resolvedPath));
|
|
256
|
-
if (verbose) {
|
|
257
|
-
console.log(` [Guideline] Found: ${displayPath}`);
|
|
258
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
259
|
-
}
|
|
260
|
-
} else {
|
|
261
|
-
userSegments.push({
|
|
262
|
-
type: "file",
|
|
263
|
-
path: displayPath,
|
|
264
|
-
text: fileContent,
|
|
265
|
-
resolvedPath: path.resolve(resolvedPath)
|
|
266
|
-
});
|
|
267
|
-
if (verbose) {
|
|
268
|
-
console.log(` [File] Found: ${displayPath}`);
|
|
269
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
270
|
-
}
|
|
271
|
-
}
|
|
272
|
-
} catch (error) {
|
|
273
|
-
logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
274
|
-
}
|
|
275
|
-
continue;
|
|
276
|
-
}
|
|
277
|
-
const clonedSegment = cloneJsonObject(rawSegment);
|
|
278
|
-
userSegments.push(clonedSegment);
|
|
279
|
-
const inlineValue = clonedSegment.value;
|
|
280
|
-
if (typeof inlineValue === "string") {
|
|
281
|
-
userTextParts.push(inlineValue);
|
|
282
|
-
}
|
|
283
|
-
}
|
|
284
|
-
}
|
|
285
|
-
const codeSnippets = extractCodeBlocks(userSegments);
|
|
304
|
+
const inputTextParts = [];
|
|
305
|
+
const inputSegments = await processMessages({
|
|
306
|
+
messages: userMessages,
|
|
307
|
+
searchRoots,
|
|
308
|
+
repoRootPath,
|
|
309
|
+
guidelinePatterns,
|
|
310
|
+
guidelinePaths,
|
|
311
|
+
textParts: inputTextParts,
|
|
312
|
+
messageType: "input",
|
|
313
|
+
verbose
|
|
314
|
+
});
|
|
315
|
+
const outputSegments = await processMessages({
|
|
316
|
+
messages: assistantMessages,
|
|
317
|
+
searchRoots,
|
|
318
|
+
repoRootPath,
|
|
319
|
+
guidelinePatterns,
|
|
320
|
+
messageType: "output",
|
|
321
|
+
verbose
|
|
322
|
+
});
|
|
323
|
+
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
286
324
|
const assistantContent = assistantMessages[0]?.content;
|
|
287
|
-
const
|
|
288
|
-
const
|
|
325
|
+
const referenceAnswer = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
326
|
+
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
289
327
|
const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
290
328
|
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
291
329
|
const userFilePaths = [];
|
|
292
|
-
for (const segment of
|
|
330
|
+
for (const segment of inputSegments) {
|
|
293
331
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
294
332
|
userFilePaths.push(segment.resolvedPath);
|
|
295
333
|
}
|
|
@@ -302,15 +340,16 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
302
340
|
id,
|
|
303
341
|
dataset: datasetName,
|
|
304
342
|
conversation_id: conversationId,
|
|
305
|
-
|
|
306
|
-
|
|
343
|
+
question,
|
|
344
|
+
input_segments: inputSegments,
|
|
345
|
+
output_segments: outputSegments,
|
|
307
346
|
system_message: systemMessageContent,
|
|
308
|
-
|
|
347
|
+
reference_answer: referenceAnswer,
|
|
309
348
|
guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
|
|
310
349
|
guideline_patterns: guidelinePatterns,
|
|
311
350
|
file_paths: allFilePaths,
|
|
312
351
|
code_snippets: codeSnippets,
|
|
313
|
-
outcome,
|
|
352
|
+
expected_outcome: outcome,
|
|
314
353
|
evaluator: testCaseEvaluatorKind,
|
|
315
354
|
evaluators
|
|
316
355
|
};
|
|
@@ -346,36 +385,36 @@ ${content}`);
|
|
|
346
385
|
logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
347
386
|
}
|
|
348
387
|
}
|
|
349
|
-
const
|
|
350
|
-
for (const segment of testCase.
|
|
388
|
+
const questionParts = [];
|
|
389
|
+
for (const segment of testCase.input_segments) {
|
|
351
390
|
const typeValue = segment.type;
|
|
352
391
|
if (typeof typeValue === "string" && typeValue === "file") {
|
|
353
392
|
const pathValue = segment.path;
|
|
354
393
|
const textValue = segment.text;
|
|
355
394
|
const label = typeof pathValue === "string" ? pathValue : "file";
|
|
356
395
|
const body = typeof textValue === "string" ? textValue : "";
|
|
357
|
-
|
|
396
|
+
questionParts.push(`=== ${label} ===
|
|
358
397
|
${body}`);
|
|
359
398
|
continue;
|
|
360
399
|
}
|
|
361
400
|
if (typeof typeValue === "string" && typeValue === "text") {
|
|
362
401
|
const value = segment.value;
|
|
363
402
|
if (typeof value === "string") {
|
|
364
|
-
|
|
403
|
+
questionParts.push(value);
|
|
365
404
|
}
|
|
366
405
|
continue;
|
|
367
406
|
}
|
|
368
407
|
const genericValue = segment.value;
|
|
369
408
|
if (typeof genericValue === "string") {
|
|
370
|
-
|
|
409
|
+
questionParts.push(genericValue);
|
|
371
410
|
}
|
|
372
411
|
}
|
|
373
412
|
if (testCase.code_snippets.length > 0) {
|
|
374
|
-
|
|
413
|
+
questionParts.push(testCase.code_snippets.join("\n"));
|
|
375
414
|
}
|
|
376
|
-
const
|
|
415
|
+
const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
377
416
|
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
378
|
-
return {
|
|
417
|
+
return { question, guidelines, systemMessage: testCase.system_message };
|
|
379
418
|
}
|
|
380
419
|
async function fileExists2(absolutePath) {
|
|
381
420
|
try {
|
|
@@ -587,7 +626,7 @@ function buildChatPrompt(request) {
|
|
|
587
626
|
${request.guidelines.trim()}`);
|
|
588
627
|
}
|
|
589
628
|
const systemContent = systemSegments.join("\n\n");
|
|
590
|
-
const userContent = request.
|
|
629
|
+
const userContent = request.question.trim();
|
|
591
630
|
const prompt = [
|
|
592
631
|
{
|
|
593
632
|
role: "system",
|
|
@@ -885,7 +924,7 @@ var CliProvider = class {
|
|
|
885
924
|
healthcheck.commandTemplate,
|
|
886
925
|
buildTemplateValues(
|
|
887
926
|
{
|
|
888
|
-
|
|
927
|
+
question: "",
|
|
889
928
|
guidelines: "",
|
|
890
929
|
inputFiles: [],
|
|
891
930
|
evalCaseId: "",
|
|
@@ -912,7 +951,7 @@ var CliProvider = class {
|
|
|
912
951
|
function buildTemplateValues(request, config) {
|
|
913
952
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
914
953
|
return {
|
|
915
|
-
PROMPT: shellEscape(request.
|
|
954
|
+
PROMPT: shellEscape(request.question ?? ""),
|
|
916
955
|
GUIDELINES: shellEscape(request.guidelines ?? ""),
|
|
917
956
|
EVAL_ID: shellEscape(request.evalCaseId ?? ""),
|
|
918
957
|
ATTEMPT: shellEscape(String(request.attempt ?? 0)),
|
|
@@ -976,6 +1015,59 @@ import { tmpdir } from "node:os";
|
|
|
976
1015
|
import path4 from "node:path";
|
|
977
1016
|
import { promisify as promisify2 } from "node:util";
|
|
978
1017
|
|
|
1018
|
+
// src/evaluation/providers/codex-log-tracker.ts
|
|
1019
|
+
var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
|
|
1020
|
+
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
|
|
1021
|
+
function getCodexLogStore() {
|
|
1022
|
+
const globalObject = globalThis;
|
|
1023
|
+
const existing = globalObject[GLOBAL_LOGS_KEY];
|
|
1024
|
+
if (existing) {
|
|
1025
|
+
return existing;
|
|
1026
|
+
}
|
|
1027
|
+
const created = [];
|
|
1028
|
+
globalObject[GLOBAL_LOGS_KEY] = created;
|
|
1029
|
+
return created;
|
|
1030
|
+
}
|
|
1031
|
+
function getSubscriberStore() {
|
|
1032
|
+
const globalObject = globalThis;
|
|
1033
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
|
|
1034
|
+
if (existing) {
|
|
1035
|
+
return existing;
|
|
1036
|
+
}
|
|
1037
|
+
const created = /* @__PURE__ */ new Set();
|
|
1038
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
|
|
1039
|
+
return created;
|
|
1040
|
+
}
|
|
1041
|
+
function notifySubscribers(entry) {
|
|
1042
|
+
const subscribers = Array.from(getSubscriberStore());
|
|
1043
|
+
for (const listener of subscribers) {
|
|
1044
|
+
try {
|
|
1045
|
+
listener(entry);
|
|
1046
|
+
} catch (error) {
|
|
1047
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1048
|
+
console.warn(`Codex log subscriber failed: ${message}`);
|
|
1049
|
+
}
|
|
1050
|
+
}
|
|
1051
|
+
}
|
|
1052
|
+
function recordCodexLogEntry(entry) {
|
|
1053
|
+
getCodexLogStore().push(entry);
|
|
1054
|
+
notifySubscribers(entry);
|
|
1055
|
+
}
|
|
1056
|
+
function consumeCodexLogEntries() {
|
|
1057
|
+
const store = getCodexLogStore();
|
|
1058
|
+
if (store.length === 0) {
|
|
1059
|
+
return [];
|
|
1060
|
+
}
|
|
1061
|
+
return store.splice(0, store.length);
|
|
1062
|
+
}
|
|
1063
|
+
function subscribeToCodexLogEntries(listener) {
|
|
1064
|
+
const store = getSubscriberStore();
|
|
1065
|
+
store.add(listener);
|
|
1066
|
+
return () => {
|
|
1067
|
+
store.delete(listener);
|
|
1068
|
+
};
|
|
1069
|
+
}
|
|
1070
|
+
|
|
979
1071
|
// src/evaluation/providers/preread.ts
|
|
980
1072
|
import path3 from "node:path";
|
|
981
1073
|
function buildPromptDocument(request, inputFiles, options) {
|
|
@@ -993,7 +1085,7 @@ function buildPromptDocument(request, inputFiles, options) {
|
|
|
993
1085
|
if (prereadBlock.length > 0) {
|
|
994
1086
|
parts.push("\n", prereadBlock);
|
|
995
1087
|
}
|
|
996
|
-
parts.push("\n[[ ## user_query ## ]]\n", request.
|
|
1088
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
997
1089
|
return parts.join("\n").trim();
|
|
998
1090
|
}
|
|
999
1091
|
function normalizeInputFiles2(inputFiles) {
|
|
@@ -1077,59 +1169,6 @@ function pathToFileUri(filePath) {
|
|
|
1077
1169
|
return `file://${normalizedPath}`;
|
|
1078
1170
|
}
|
|
1079
1171
|
|
|
1080
|
-
// src/evaluation/providers/codex-log-tracker.ts
|
|
1081
|
-
var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
|
|
1082
|
-
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
|
|
1083
|
-
function getCodexLogStore() {
|
|
1084
|
-
const globalObject = globalThis;
|
|
1085
|
-
const existing = globalObject[GLOBAL_LOGS_KEY];
|
|
1086
|
-
if (existing) {
|
|
1087
|
-
return existing;
|
|
1088
|
-
}
|
|
1089
|
-
const created = [];
|
|
1090
|
-
globalObject[GLOBAL_LOGS_KEY] = created;
|
|
1091
|
-
return created;
|
|
1092
|
-
}
|
|
1093
|
-
function getSubscriberStore() {
|
|
1094
|
-
const globalObject = globalThis;
|
|
1095
|
-
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
|
|
1096
|
-
if (existing) {
|
|
1097
|
-
return existing;
|
|
1098
|
-
}
|
|
1099
|
-
const created = /* @__PURE__ */ new Set();
|
|
1100
|
-
globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
|
|
1101
|
-
return created;
|
|
1102
|
-
}
|
|
1103
|
-
function notifySubscribers(entry) {
|
|
1104
|
-
const subscribers = Array.from(getSubscriberStore());
|
|
1105
|
-
for (const listener of subscribers) {
|
|
1106
|
-
try {
|
|
1107
|
-
listener(entry);
|
|
1108
|
-
} catch (error) {
|
|
1109
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
1110
|
-
console.warn(`Codex log subscriber failed: ${message}`);
|
|
1111
|
-
}
|
|
1112
|
-
}
|
|
1113
|
-
}
|
|
1114
|
-
function recordCodexLogEntry(entry) {
|
|
1115
|
-
getCodexLogStore().push(entry);
|
|
1116
|
-
notifySubscribers(entry);
|
|
1117
|
-
}
|
|
1118
|
-
function consumeCodexLogEntries() {
|
|
1119
|
-
const store = getCodexLogStore();
|
|
1120
|
-
if (store.length === 0) {
|
|
1121
|
-
return [];
|
|
1122
|
-
}
|
|
1123
|
-
return store.splice(0, store.length);
|
|
1124
|
-
}
|
|
1125
|
-
function subscribeToCodexLogEntries(listener) {
|
|
1126
|
-
const store = getSubscriberStore();
|
|
1127
|
-
store.add(listener);
|
|
1128
|
-
return () => {
|
|
1129
|
-
store.delete(listener);
|
|
1130
|
-
};
|
|
1131
|
-
}
|
|
1132
|
-
|
|
1133
1172
|
// src/evaluation/providers/codex.ts
|
|
1134
1173
|
var execAsync2 = promisify2(execCallback);
|
|
1135
1174
|
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
@@ -1863,7 +1902,7 @@ var MockProvider = class {
|
|
|
1863
1902
|
return {
|
|
1864
1903
|
text: this.cannedResponse,
|
|
1865
1904
|
raw: {
|
|
1866
|
-
|
|
1905
|
+
question: request.question,
|
|
1867
1906
|
guidelines: request.guidelines
|
|
1868
1907
|
}
|
|
1869
1908
|
};
|
|
@@ -2479,7 +2518,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
|
2479
2518
|
if (prereadBlock.length > 0) {
|
|
2480
2519
|
parts.push("\n", prereadBlock);
|
|
2481
2520
|
}
|
|
2482
|
-
parts.push("\n[[ ## user_query ## ]]\n", request.
|
|
2521
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
2483
2522
|
return parts.join("\n").trim();
|
|
2484
2523
|
}
|
|
2485
2524
|
function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
@@ -2716,30 +2755,7 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
2716
2755
|
}
|
|
2717
2756
|
|
|
2718
2757
|
// src/evaluation/evaluators.ts
|
|
2719
|
-
import { ax, f } from "@ax-llm/ax";
|
|
2720
2758
|
import { randomUUID as randomUUID2 } from "node:crypto";
|
|
2721
|
-
var LLM_JUDGE_SIGNATURE = f().input(
|
|
2722
|
-
"evaluationContext",
|
|
2723
|
-
f.object(
|
|
2724
|
-
{
|
|
2725
|
-
expectedOutcome: f.string("The expected outcome for the original task"),
|
|
2726
|
-
request: f.string("The original task request"),
|
|
2727
|
-
referenceAnswer: f.string("The gold standard reference answer"),
|
|
2728
|
-
generatedAnswer: f.string("The answer to evaluate"),
|
|
2729
|
-
guidelines: f.string("Additional evaluation guidelines or instructions").optional()
|
|
2730
|
-
},
|
|
2731
|
-
"Complete evaluation context for the judge"
|
|
2732
|
-
)
|
|
2733
|
-
).output(
|
|
2734
|
-
"evaluation",
|
|
2735
|
-
f.object({
|
|
2736
|
-
score: f.number("Score between 0.0 and 1.0").min(0).max(1),
|
|
2737
|
-
hits: f.string("Brief specific achievement").array(),
|
|
2738
|
-
misses: f.string("Brief specific failure or omission").array(),
|
|
2739
|
-
reasoning: f.string("Concise explanation for the score").max(500)
|
|
2740
|
-
})
|
|
2741
|
-
).build();
|
|
2742
|
-
var LLM_JUDGE = ax(LLM_JUDGE_SIGNATURE);
|
|
2743
2759
|
var LlmJudgeEvaluator = class {
|
|
2744
2760
|
kind = "llm_judge";
|
|
2745
2761
|
resolveJudgeProvider;
|
|
@@ -2757,52 +2773,29 @@ var LlmJudgeEvaluator = class {
|
|
|
2757
2773
|
if (!judgeProvider) {
|
|
2758
2774
|
throw new Error("No judge provider available for LLM grading");
|
|
2759
2775
|
}
|
|
2760
|
-
if (providerSupportsAx(judgeProvider)) {
|
|
2761
|
-
return this.evaluateWithAx(context, judgeProvider);
|
|
2762
|
-
}
|
|
2763
2776
|
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2764
2777
|
}
|
|
2765
|
-
async evaluateWithAx(context, judgeProvider) {
|
|
2766
|
-
const ai = judgeProvider.getAxAI();
|
|
2767
|
-
const guidelines = context.promptInputs.guidelines?.trim();
|
|
2768
|
-
const evaluationContext = {
|
|
2769
|
-
expectedOutcome: context.evalCase.outcome.trim(),
|
|
2770
|
-
request: context.evalCase.task.trim(),
|
|
2771
|
-
referenceAnswer: context.evalCase.expected_assistant_raw.trim(),
|
|
2772
|
-
generatedAnswer: context.candidate.trim(),
|
|
2773
|
-
...guidelines ? { guidelines } : {}
|
|
2774
|
-
};
|
|
2775
|
-
const options = this.buildJudgeForwardOptions(context);
|
|
2776
|
-
const result = await LLM_JUDGE.forward(ai, { evaluationContext }, options);
|
|
2777
|
-
const evaluation = result.evaluation;
|
|
2778
|
-
const expectedAspectCount = Math.max(
|
|
2779
|
-
evaluation.hits.length + evaluation.misses.length,
|
|
2780
|
-
1
|
|
2781
|
-
);
|
|
2782
|
-
return {
|
|
2783
|
-
score: evaluation.score,
|
|
2784
|
-
hits: evaluation.hits,
|
|
2785
|
-
misses: evaluation.misses,
|
|
2786
|
-
expectedAspectCount,
|
|
2787
|
-
reasoning: evaluation.reasoning,
|
|
2788
|
-
evaluatorRawRequest: {
|
|
2789
|
-
id: randomUUID2(),
|
|
2790
|
-
provider: judgeProvider.id,
|
|
2791
|
-
target: context.target.name,
|
|
2792
|
-
method: "ax-structured-output",
|
|
2793
|
-
signature: LLM_JUDGE_SIGNATURE.toString()
|
|
2794
|
-
}
|
|
2795
|
-
};
|
|
2796
|
-
}
|
|
2797
2778
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
2798
|
-
|
|
2799
|
-
|
|
2779
|
+
let prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
2780
|
+
let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
|
|
2781
|
+
if (systemPrompt && hasTemplateVariables(systemPrompt)) {
|
|
2782
|
+
const variables = {
|
|
2783
|
+
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
2784
|
+
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
2785
|
+
candidate_answer: context.candidate,
|
|
2786
|
+
reference_answer: context.evalCase.reference_answer,
|
|
2787
|
+
expected_outcome: context.evalCase.expected_outcome,
|
|
2788
|
+
question: context.evalCase.question
|
|
2789
|
+
};
|
|
2790
|
+
prompt = substituteVariables(systemPrompt, variables);
|
|
2791
|
+
systemPrompt = QUALITY_SYSTEM_PROMPT;
|
|
2792
|
+
}
|
|
2800
2793
|
const metadata = {
|
|
2801
2794
|
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2802
2795
|
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
2803
2796
|
};
|
|
2804
2797
|
const response = await judgeProvider.invoke({
|
|
2805
|
-
prompt,
|
|
2798
|
+
question: prompt,
|
|
2806
2799
|
metadata,
|
|
2807
2800
|
evalCaseId: context.evalCase.id,
|
|
2808
2801
|
attempt: context.attempt,
|
|
@@ -2832,33 +2825,11 @@ var LlmJudgeEvaluator = class {
|
|
|
2832
2825
|
evaluatorRawRequest
|
|
2833
2826
|
};
|
|
2834
2827
|
}
|
|
2835
|
-
buildJudgeForwardOptions(context) {
|
|
2836
|
-
const modelConfig = this.buildJudgeModelConfig();
|
|
2837
|
-
if (modelConfig === void 0 && context.judgeModel === void 0) {
|
|
2838
|
-
return void 0;
|
|
2839
|
-
}
|
|
2840
|
-
return {
|
|
2841
|
-
...context.judgeModel ? { model: context.judgeModel } : {},
|
|
2842
|
-
...modelConfig ? { modelConfig } : {}
|
|
2843
|
-
};
|
|
2844
|
-
}
|
|
2845
|
-
buildJudgeModelConfig() {
|
|
2846
|
-
if (this.maxOutputTokens === void 0 && this.temperature === void 0) {
|
|
2847
|
-
return void 0;
|
|
2848
|
-
}
|
|
2849
|
-
return {
|
|
2850
|
-
...this.maxOutputTokens !== void 0 ? { maxTokens: this.maxOutputTokens } : {},
|
|
2851
|
-
...this.temperature !== void 0 ? { temperature: this.temperature } : {}
|
|
2852
|
-
};
|
|
2853
|
-
}
|
|
2854
2828
|
};
|
|
2855
|
-
function providerSupportsAx(provider) {
|
|
2856
|
-
return typeof provider.getAxAI === "function";
|
|
2857
|
-
}
|
|
2858
2829
|
var QUALITY_SYSTEM_PROMPT = [
|
|
2859
|
-
"You are an expert evaluator. Your goal is to grade the
|
|
2830
|
+
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
2860
2831
|
"",
|
|
2861
|
-
"Use the reference_answer as a gold standard for a high-quality response. The
|
|
2832
|
+
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
|
|
2862
2833
|
"",
|
|
2863
2834
|
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
2864
2835
|
"",
|
|
@@ -2871,18 +2842,18 @@ var QUALITY_SYSTEM_PROMPT = [
|
|
|
2871
2842
|
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
2872
2843
|
"}"
|
|
2873
2844
|
].join("\n");
|
|
2874
|
-
function buildQualityPrompt(
|
|
2845
|
+
function buildQualityPrompt(evalCase, candidate) {
|
|
2875
2846
|
const parts = [
|
|
2876
2847
|
"[[ ## expected_outcome ## ]]",
|
|
2877
|
-
|
|
2848
|
+
evalCase.expected_outcome.trim(),
|
|
2878
2849
|
"",
|
|
2879
|
-
"[[ ##
|
|
2880
|
-
|
|
2850
|
+
"[[ ## question ## ]]",
|
|
2851
|
+
evalCase.question.trim(),
|
|
2881
2852
|
"",
|
|
2882
2853
|
"[[ ## reference_answer ## ]]",
|
|
2883
|
-
|
|
2854
|
+
evalCase.reference_answer.trim(),
|
|
2884
2855
|
"",
|
|
2885
|
-
"[[ ##
|
|
2856
|
+
"[[ ## candidate_answer ## ]]",
|
|
2886
2857
|
candidate.trim(),
|
|
2887
2858
|
"",
|
|
2888
2859
|
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
@@ -2982,14 +2953,14 @@ var CodeEvaluator = class {
|
|
|
2982
2953
|
async evaluate(context) {
|
|
2983
2954
|
const inputPayload = JSON.stringify(
|
|
2984
2955
|
{
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
|
|
2988
|
-
|
|
2956
|
+
question: context.evalCase.question,
|
|
2957
|
+
expected_outcome: context.evalCase.expected_outcome,
|
|
2958
|
+
reference_answer: context.evalCase.reference_answer,
|
|
2959
|
+
candidate_answer: context.candidate,
|
|
2989
2960
|
system_message: context.promptInputs.systemMessage ?? "",
|
|
2990
2961
|
guideline_paths: context.evalCase.guideline_paths,
|
|
2991
|
-
|
|
2992
|
-
|
|
2962
|
+
input_files: context.evalCase.file_paths,
|
|
2963
|
+
input_segments: context.evalCase.input_segments
|
|
2993
2964
|
},
|
|
2994
2965
|
null,
|
|
2995
2966
|
2
|
|
@@ -3075,6 +3046,14 @@ function parseJsonSafe(payload) {
|
|
|
3075
3046
|
return void 0;
|
|
3076
3047
|
}
|
|
3077
3048
|
}
|
|
3049
|
+
function hasTemplateVariables(text) {
|
|
3050
|
+
return /\$\{[a-zA-Z0-9_]+\}/.test(text);
|
|
3051
|
+
}
|
|
3052
|
+
function substituteVariables(template, variables) {
|
|
3053
|
+
return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
|
|
3054
|
+
return variables[varName] ?? match;
|
|
3055
|
+
});
|
|
3056
|
+
}
|
|
3078
3057
|
|
|
3079
3058
|
// src/evaluation/orchestrator.ts
|
|
3080
3059
|
import { createHash, randomUUID as randomUUID3 } from "node:crypto";
|
|
@@ -3431,7 +3410,7 @@ async function runBatchEvaluation(options) {
|
|
|
3431
3410
|
const batchRequests = evalCases.map((evalCase, index) => {
|
|
3432
3411
|
const promptInputs = promptInputsList[index];
|
|
3433
3412
|
return {
|
|
3434
|
-
|
|
3413
|
+
question: promptInputs.question,
|
|
3435
3414
|
guidelines: promptInputs.guidelines,
|
|
3436
3415
|
guideline_patterns: evalCase.guideline_patterns,
|
|
3437
3416
|
inputFiles: evalCase.file_paths,
|
|
@@ -3618,7 +3597,7 @@ async function evaluateCandidate(options) {
|
|
|
3618
3597
|
});
|
|
3619
3598
|
const completedAt = nowFn();
|
|
3620
3599
|
const rawRequest = {
|
|
3621
|
-
|
|
3600
|
+
question: promptInputs.question,
|
|
3622
3601
|
guidelines: promptInputs.guidelines,
|
|
3623
3602
|
guideline_paths: evalCase.guideline_paths,
|
|
3624
3603
|
system_message: promptInputs.systemMessage ?? ""
|
|
@@ -3630,7 +3609,7 @@ async function evaluateCandidate(options) {
|
|
|
3630
3609
|
score: score.score,
|
|
3631
3610
|
hits: score.hits,
|
|
3632
3611
|
misses: score.misses,
|
|
3633
|
-
|
|
3612
|
+
candidate_answer: candidate,
|
|
3634
3613
|
expected_aspect_count: score.expectedAspectCount,
|
|
3635
3614
|
target: target.name,
|
|
3636
3615
|
timestamp: completedAt.toISOString(),
|
|
@@ -3840,7 +3819,7 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
|
3840
3819
|
await mkdir2(path7.dirname(filePath), { recursive: true });
|
|
3841
3820
|
const payload = {
|
|
3842
3821
|
eval_id: evalCase.id,
|
|
3843
|
-
|
|
3822
|
+
question: promptInputs.question,
|
|
3844
3823
|
guidelines: promptInputs.guidelines,
|
|
3845
3824
|
guideline_paths: evalCase.guideline_paths
|
|
3846
3825
|
};
|
|
@@ -3862,7 +3841,7 @@ async function invokeProvider(provider, options) {
|
|
|
3862
3841
|
}
|
|
3863
3842
|
try {
|
|
3864
3843
|
return await provider.invoke({
|
|
3865
|
-
|
|
3844
|
+
question: promptInputs.question,
|
|
3866
3845
|
guidelines: promptInputs.guidelines,
|
|
3867
3846
|
guideline_patterns: evalCase.guideline_patterns,
|
|
3868
3847
|
inputFiles: evalCase.file_paths,
|
|
@@ -3882,7 +3861,7 @@ async function invokeProvider(provider, options) {
|
|
|
3882
3861
|
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
|
|
3883
3862
|
const message = error instanceof Error ? error.message : String(error);
|
|
3884
3863
|
const rawRequest = {
|
|
3885
|
-
|
|
3864
|
+
question: promptInputs.question,
|
|
3886
3865
|
guidelines: promptInputs.guidelines,
|
|
3887
3866
|
guideline_paths: evalCase.guideline_paths,
|
|
3888
3867
|
system_message: promptInputs.systemMessage ?? "",
|
|
@@ -3895,7 +3874,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
|
|
|
3895
3874
|
score: 0,
|
|
3896
3875
|
hits: [],
|
|
3897
3876
|
misses: [`Error: ${message}`],
|
|
3898
|
-
|
|
3877
|
+
candidate_answer: `Error occurred: ${message}`,
|
|
3899
3878
|
expected_aspect_count: 0,
|
|
3900
3879
|
target: targetName,
|
|
3901
3880
|
timestamp: timestamp.toISOString(),
|
|
@@ -3908,7 +3887,7 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
|
3908
3887
|
hash.update(provider.id);
|
|
3909
3888
|
hash.update(target.name);
|
|
3910
3889
|
hash.update(evalCase.id);
|
|
3911
|
-
hash.update(promptInputs.
|
|
3890
|
+
hash.update(promptInputs.question);
|
|
3912
3891
|
hash.update(promptInputs.guidelines);
|
|
3913
3892
|
hash.update(promptInputs.systemMessage ?? "");
|
|
3914
3893
|
return hash.digest("hex");
|