@agentv/core 0.6.1 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-OW3SHBIJ.js → chunk-UQLHF3T7.js} +12 -3
- package/dist/chunk-UQLHF3T7.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +143 -2
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.d.cts +1 -1
- package/dist/evaluation/validation/index.d.ts +1 -1
- package/dist/evaluation/validation/index.js +143 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +277 -328
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -13
- package/dist/index.d.ts +11 -13
- package/dist/index.js +267 -325
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-OW3SHBIJ.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -4,9 +4,10 @@ import {
|
|
|
4
4
|
buildSearchRoots,
|
|
5
5
|
fileExists,
|
|
6
6
|
findGitRoot,
|
|
7
|
+
isAgentProvider,
|
|
7
8
|
readTextFile,
|
|
8
9
|
resolveFileReference
|
|
9
|
-
} from "./chunk-
|
|
10
|
+
} from "./chunk-UQLHF3T7.js";
|
|
10
11
|
|
|
11
12
|
// src/evaluation/types.ts
|
|
12
13
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -134,6 +135,87 @@ function extractCodeBlocks(segments) {
|
|
|
134
135
|
}
|
|
135
136
|
return codeBlocks;
|
|
136
137
|
}
|
|
138
|
+
async function processMessages(options) {
|
|
139
|
+
const {
|
|
140
|
+
messages,
|
|
141
|
+
searchRoots,
|
|
142
|
+
repoRootPath,
|
|
143
|
+
guidelinePatterns,
|
|
144
|
+
guidelinePaths,
|
|
145
|
+
textParts,
|
|
146
|
+
messageType,
|
|
147
|
+
verbose
|
|
148
|
+
} = options;
|
|
149
|
+
const segments = [];
|
|
150
|
+
for (const message of messages) {
|
|
151
|
+
const content = message.content;
|
|
152
|
+
if (typeof content === "string") {
|
|
153
|
+
segments.push({ type: "text", value: content });
|
|
154
|
+
if (textParts) {
|
|
155
|
+
textParts.push(content);
|
|
156
|
+
}
|
|
157
|
+
continue;
|
|
158
|
+
}
|
|
159
|
+
for (const rawSegment of content) {
|
|
160
|
+
if (!isJsonObject(rawSegment)) {
|
|
161
|
+
continue;
|
|
162
|
+
}
|
|
163
|
+
const segmentType = asString(rawSegment.type);
|
|
164
|
+
if (segmentType === "file") {
|
|
165
|
+
const rawValue = asString(rawSegment.value);
|
|
166
|
+
if (!rawValue) {
|
|
167
|
+
continue;
|
|
168
|
+
}
|
|
169
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
170
|
+
rawValue,
|
|
171
|
+
searchRoots
|
|
172
|
+
);
|
|
173
|
+
if (!resolvedPath) {
|
|
174
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
175
|
+
const context = messageType === "input" ? "" : " in expected_messages";
|
|
176
|
+
logWarning(`File not found${context}: ${displayPath}`, attempts);
|
|
177
|
+
continue;
|
|
178
|
+
}
|
|
179
|
+
try {
|
|
180
|
+
const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
181
|
+
if (messageType === "input" && guidelinePatterns && guidelinePaths) {
|
|
182
|
+
const relativeToRepo = path.relative(repoRootPath, resolvedPath);
|
|
183
|
+
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
184
|
+
guidelinePaths.push(path.resolve(resolvedPath));
|
|
185
|
+
if (verbose) {
|
|
186
|
+
console.log(` [Guideline] Found: ${displayPath}`);
|
|
187
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
188
|
+
}
|
|
189
|
+
continue;
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
segments.push({
|
|
193
|
+
type: "file",
|
|
194
|
+
path: displayPath,
|
|
195
|
+
text: fileContent,
|
|
196
|
+
resolvedPath: path.resolve(resolvedPath)
|
|
197
|
+
});
|
|
198
|
+
if (verbose) {
|
|
199
|
+
const label = messageType === "input" ? "[File]" : "[Expected Output File]";
|
|
200
|
+
console.log(` ${label} Found: ${displayPath}`);
|
|
201
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
202
|
+
}
|
|
203
|
+
} catch (error) {
|
|
204
|
+
const context = messageType === "input" ? "" : " expected output";
|
|
205
|
+
logWarning(`Could not read${context} file ${resolvedPath}: ${error.message}`);
|
|
206
|
+
}
|
|
207
|
+
continue;
|
|
208
|
+
}
|
|
209
|
+
const clonedSegment = cloneJsonObject(rawSegment);
|
|
210
|
+
segments.push(clonedSegment);
|
|
211
|
+
const inlineValue = clonedSegment.value;
|
|
212
|
+
if (typeof inlineValue === "string" && textParts) {
|
|
213
|
+
textParts.push(inlineValue);
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
return segments;
|
|
218
|
+
}
|
|
137
219
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
138
220
|
const verbose = options?.verbose ?? false;
|
|
139
221
|
const absoluteTestPath = path.resolve(evalFilePath);
|
|
@@ -219,77 +301,34 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
219
301
|
}
|
|
220
302
|
}
|
|
221
303
|
}
|
|
222
|
-
const userSegments = [];
|
|
223
304
|
const guidelinePaths = [];
|
|
224
|
-
const
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
searchRoots
|
|
245
|
-
);
|
|
246
|
-
if (!resolvedPath) {
|
|
247
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
248
|
-
logWarning(`File not found: ${displayPath}`, attempts);
|
|
249
|
-
continue;
|
|
250
|
-
}
|
|
251
|
-
try {
|
|
252
|
-
const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
253
|
-
const relativeToRepo = path.relative(repoRootPath, resolvedPath);
|
|
254
|
-
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
255
|
-
guidelinePaths.push(path.resolve(resolvedPath));
|
|
256
|
-
if (verbose) {
|
|
257
|
-
console.log(` [Guideline] Found: ${displayPath}`);
|
|
258
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
259
|
-
}
|
|
260
|
-
} else {
|
|
261
|
-
userSegments.push({
|
|
262
|
-
type: "file",
|
|
263
|
-
path: displayPath,
|
|
264
|
-
text: fileContent,
|
|
265
|
-
resolvedPath: path.resolve(resolvedPath)
|
|
266
|
-
});
|
|
267
|
-
if (verbose) {
|
|
268
|
-
console.log(` [File] Found: ${displayPath}`);
|
|
269
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
270
|
-
}
|
|
271
|
-
}
|
|
272
|
-
} catch (error) {
|
|
273
|
-
logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
274
|
-
}
|
|
275
|
-
continue;
|
|
276
|
-
}
|
|
277
|
-
const clonedSegment = cloneJsonObject(rawSegment);
|
|
278
|
-
userSegments.push(clonedSegment);
|
|
279
|
-
const inlineValue = clonedSegment.value;
|
|
280
|
-
if (typeof inlineValue === "string") {
|
|
281
|
-
userTextParts.push(inlineValue);
|
|
282
|
-
}
|
|
283
|
-
}
|
|
284
|
-
}
|
|
285
|
-
const codeSnippets = extractCodeBlocks(userSegments);
|
|
305
|
+
const inputTextParts = [];
|
|
306
|
+
const inputSegments = await processMessages({
|
|
307
|
+
messages: userMessages,
|
|
308
|
+
searchRoots,
|
|
309
|
+
repoRootPath,
|
|
310
|
+
guidelinePatterns,
|
|
311
|
+
guidelinePaths,
|
|
312
|
+
textParts: inputTextParts,
|
|
313
|
+
messageType: "input",
|
|
314
|
+
verbose
|
|
315
|
+
});
|
|
316
|
+
const outputSegments = await processMessages({
|
|
317
|
+
messages: assistantMessages,
|
|
318
|
+
searchRoots,
|
|
319
|
+
repoRootPath,
|
|
320
|
+
guidelinePatterns,
|
|
321
|
+
messageType: "output",
|
|
322
|
+
verbose
|
|
323
|
+
});
|
|
324
|
+
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
286
325
|
const assistantContent = assistantMessages[0]?.content;
|
|
287
|
-
const
|
|
288
|
-
const
|
|
326
|
+
const referenceAnswer = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
327
|
+
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
289
328
|
const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
290
329
|
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
291
330
|
const userFilePaths = [];
|
|
292
|
-
for (const segment of
|
|
331
|
+
for (const segment of inputSegments) {
|
|
293
332
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
294
333
|
userFilePaths.push(segment.resolvedPath);
|
|
295
334
|
}
|
|
@@ -302,15 +341,16 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
302
341
|
id,
|
|
303
342
|
dataset: datasetName,
|
|
304
343
|
conversation_id: conversationId,
|
|
305
|
-
|
|
306
|
-
|
|
344
|
+
question,
|
|
345
|
+
input_segments: inputSegments,
|
|
346
|
+
output_segments: outputSegments,
|
|
307
347
|
system_message: systemMessageContent,
|
|
308
|
-
|
|
348
|
+
reference_answer: referenceAnswer,
|
|
309
349
|
guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
|
|
310
350
|
guideline_patterns: guidelinePatterns,
|
|
311
351
|
file_paths: allFilePaths,
|
|
312
352
|
code_snippets: codeSnippets,
|
|
313
|
-
outcome,
|
|
353
|
+
expected_outcome: outcome,
|
|
314
354
|
evaluator: testCaseEvaluatorKind,
|
|
315
355
|
evaluators
|
|
316
356
|
};
|
|
@@ -346,36 +386,36 @@ ${content}`);
|
|
|
346
386
|
logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
347
387
|
}
|
|
348
388
|
}
|
|
349
|
-
const
|
|
350
|
-
for (const segment of testCase.
|
|
389
|
+
const questionParts = [];
|
|
390
|
+
for (const segment of testCase.input_segments) {
|
|
351
391
|
const typeValue = segment.type;
|
|
352
392
|
if (typeof typeValue === "string" && typeValue === "file") {
|
|
353
393
|
const pathValue = segment.path;
|
|
354
394
|
const textValue = segment.text;
|
|
355
395
|
const label = typeof pathValue === "string" ? pathValue : "file";
|
|
356
396
|
const body = typeof textValue === "string" ? textValue : "";
|
|
357
|
-
|
|
397
|
+
questionParts.push(`=== ${label} ===
|
|
358
398
|
${body}`);
|
|
359
399
|
continue;
|
|
360
400
|
}
|
|
361
401
|
if (typeof typeValue === "string" && typeValue === "text") {
|
|
362
402
|
const value = segment.value;
|
|
363
403
|
if (typeof value === "string") {
|
|
364
|
-
|
|
404
|
+
questionParts.push(value);
|
|
365
405
|
}
|
|
366
406
|
continue;
|
|
367
407
|
}
|
|
368
408
|
const genericValue = segment.value;
|
|
369
409
|
if (typeof genericValue === "string") {
|
|
370
|
-
|
|
410
|
+
questionParts.push(genericValue);
|
|
371
411
|
}
|
|
372
412
|
}
|
|
373
413
|
if (testCase.code_snippets.length > 0) {
|
|
374
|
-
|
|
414
|
+
questionParts.push(testCase.code_snippets.join("\n"));
|
|
375
415
|
}
|
|
376
|
-
const
|
|
416
|
+
const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
377
417
|
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
378
|
-
return {
|
|
418
|
+
return { question, guidelines, systemMessage: testCase.system_message };
|
|
379
419
|
}
|
|
380
420
|
async function fileExists2(absolutePath) {
|
|
381
421
|
try {
|
|
@@ -587,7 +627,7 @@ function buildChatPrompt(request) {
|
|
|
587
627
|
${request.guidelines.trim()}`);
|
|
588
628
|
}
|
|
589
629
|
const systemContent = systemSegments.join("\n\n");
|
|
590
|
-
const userContent = request.
|
|
630
|
+
const userContent = request.question.trim();
|
|
591
631
|
const prompt = [
|
|
592
632
|
{
|
|
593
633
|
role: "system",
|
|
@@ -885,7 +925,7 @@ var CliProvider = class {
|
|
|
885
925
|
healthcheck.commandTemplate,
|
|
886
926
|
buildTemplateValues(
|
|
887
927
|
{
|
|
888
|
-
|
|
928
|
+
question: "",
|
|
889
929
|
guidelines: "",
|
|
890
930
|
inputFiles: [],
|
|
891
931
|
evalCaseId: "",
|
|
@@ -912,7 +952,7 @@ var CliProvider = class {
|
|
|
912
952
|
function buildTemplateValues(request, config) {
|
|
913
953
|
const inputFiles = normalizeInputFiles(request.inputFiles);
|
|
914
954
|
return {
|
|
915
|
-
PROMPT: shellEscape(request.
|
|
955
|
+
PROMPT: shellEscape(request.question ?? ""),
|
|
916
956
|
GUIDELINES: shellEscape(request.guidelines ?? ""),
|
|
917
957
|
EVAL_ID: shellEscape(request.evalCaseId ?? ""),
|
|
918
958
|
ATTEMPT: shellEscape(String(request.attempt ?? 0)),
|
|
@@ -971,11 +1011,64 @@ function formatTimeoutSuffix(timeoutMs) {
|
|
|
971
1011
|
import { exec as execCallback, spawn } from "node:child_process";
|
|
972
1012
|
import { randomUUID } from "node:crypto";
|
|
973
1013
|
import { constants as constants2, createWriteStream } from "node:fs";
|
|
974
|
-
import { access as access2,
|
|
1014
|
+
import { access as access2, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
|
|
975
1015
|
import { tmpdir } from "node:os";
|
|
976
1016
|
import path4 from "node:path";
|
|
977
1017
|
import { promisify as promisify2 } from "node:util";
|
|
978
1018
|
|
|
1019
|
+
// src/evaluation/providers/codex-log-tracker.ts
|
|
1020
|
+
var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
|
|
1021
|
+
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
|
|
1022
|
+
function getCodexLogStore() {
|
|
1023
|
+
const globalObject = globalThis;
|
|
1024
|
+
const existing = globalObject[GLOBAL_LOGS_KEY];
|
|
1025
|
+
if (existing) {
|
|
1026
|
+
return existing;
|
|
1027
|
+
}
|
|
1028
|
+
const created = [];
|
|
1029
|
+
globalObject[GLOBAL_LOGS_KEY] = created;
|
|
1030
|
+
return created;
|
|
1031
|
+
}
|
|
1032
|
+
function getSubscriberStore() {
|
|
1033
|
+
const globalObject = globalThis;
|
|
1034
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
|
|
1035
|
+
if (existing) {
|
|
1036
|
+
return existing;
|
|
1037
|
+
}
|
|
1038
|
+
const created = /* @__PURE__ */ new Set();
|
|
1039
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
|
|
1040
|
+
return created;
|
|
1041
|
+
}
|
|
1042
|
+
function notifySubscribers(entry) {
|
|
1043
|
+
const subscribers = Array.from(getSubscriberStore());
|
|
1044
|
+
for (const listener of subscribers) {
|
|
1045
|
+
try {
|
|
1046
|
+
listener(entry);
|
|
1047
|
+
} catch (error) {
|
|
1048
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1049
|
+
console.warn(`Codex log subscriber failed: ${message}`);
|
|
1050
|
+
}
|
|
1051
|
+
}
|
|
1052
|
+
}
|
|
1053
|
+
function recordCodexLogEntry(entry) {
|
|
1054
|
+
getCodexLogStore().push(entry);
|
|
1055
|
+
notifySubscribers(entry);
|
|
1056
|
+
}
|
|
1057
|
+
function consumeCodexLogEntries() {
|
|
1058
|
+
const store = getCodexLogStore();
|
|
1059
|
+
if (store.length === 0) {
|
|
1060
|
+
return [];
|
|
1061
|
+
}
|
|
1062
|
+
return store.splice(0, store.length);
|
|
1063
|
+
}
|
|
1064
|
+
function subscribeToCodexLogEntries(listener) {
|
|
1065
|
+
const store = getSubscriberStore();
|
|
1066
|
+
store.add(listener);
|
|
1067
|
+
return () => {
|
|
1068
|
+
store.delete(listener);
|
|
1069
|
+
};
|
|
1070
|
+
}
|
|
1071
|
+
|
|
979
1072
|
// src/evaluation/providers/preread.ts
|
|
980
1073
|
import path3 from "node:path";
|
|
981
1074
|
function buildPromptDocument(request, inputFiles, options) {
|
|
@@ -993,7 +1086,7 @@ function buildPromptDocument(request, inputFiles, options) {
|
|
|
993
1086
|
if (prereadBlock.length > 0) {
|
|
994
1087
|
parts.push("\n", prereadBlock);
|
|
995
1088
|
}
|
|
996
|
-
parts.push("\n[[ ## user_query ## ]]\n", request.
|
|
1089
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
997
1090
|
return parts.join("\n").trim();
|
|
998
1091
|
}
|
|
999
1092
|
function normalizeInputFiles2(inputFiles) {
|
|
@@ -1077,64 +1170,10 @@ function pathToFileUri(filePath) {
|
|
|
1077
1170
|
return `file://${normalizedPath}`;
|
|
1078
1171
|
}
|
|
1079
1172
|
|
|
1080
|
-
// src/evaluation/providers/codex-log-tracker.ts
|
|
1081
|
-
var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
|
|
1082
|
-
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
|
|
1083
|
-
function getCodexLogStore() {
|
|
1084
|
-
const globalObject = globalThis;
|
|
1085
|
-
const existing = globalObject[GLOBAL_LOGS_KEY];
|
|
1086
|
-
if (existing) {
|
|
1087
|
-
return existing;
|
|
1088
|
-
}
|
|
1089
|
-
const created = [];
|
|
1090
|
-
globalObject[GLOBAL_LOGS_KEY] = created;
|
|
1091
|
-
return created;
|
|
1092
|
-
}
|
|
1093
|
-
function getSubscriberStore() {
|
|
1094
|
-
const globalObject = globalThis;
|
|
1095
|
-
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
|
|
1096
|
-
if (existing) {
|
|
1097
|
-
return existing;
|
|
1098
|
-
}
|
|
1099
|
-
const created = /* @__PURE__ */ new Set();
|
|
1100
|
-
globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
|
|
1101
|
-
return created;
|
|
1102
|
-
}
|
|
1103
|
-
function notifySubscribers(entry) {
|
|
1104
|
-
const subscribers = Array.from(getSubscriberStore());
|
|
1105
|
-
for (const listener of subscribers) {
|
|
1106
|
-
try {
|
|
1107
|
-
listener(entry);
|
|
1108
|
-
} catch (error) {
|
|
1109
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
1110
|
-
console.warn(`Codex log subscriber failed: ${message}`);
|
|
1111
|
-
}
|
|
1112
|
-
}
|
|
1113
|
-
}
|
|
1114
|
-
function recordCodexLogEntry(entry) {
|
|
1115
|
-
getCodexLogStore().push(entry);
|
|
1116
|
-
notifySubscribers(entry);
|
|
1117
|
-
}
|
|
1118
|
-
function consumeCodexLogEntries() {
|
|
1119
|
-
const store = getCodexLogStore();
|
|
1120
|
-
if (store.length === 0) {
|
|
1121
|
-
return [];
|
|
1122
|
-
}
|
|
1123
|
-
return store.splice(0, store.length);
|
|
1124
|
-
}
|
|
1125
|
-
function subscribeToCodexLogEntries(listener) {
|
|
1126
|
-
const store = getSubscriberStore();
|
|
1127
|
-
store.add(listener);
|
|
1128
|
-
return () => {
|
|
1129
|
-
store.delete(listener);
|
|
1130
|
-
};
|
|
1131
|
-
}
|
|
1132
|
-
|
|
1133
1173
|
// src/evaluation/providers/codex.ts
|
|
1134
1174
|
var execAsync2 = promisify2(execCallback);
|
|
1135
1175
|
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
1136
1176
|
var PROMPT_FILENAME = "prompt.md";
|
|
1137
|
-
var FILES_DIR = "files";
|
|
1138
1177
|
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
1139
1178
|
var CodexProvider = class {
|
|
1140
1179
|
id;
|
|
@@ -1157,21 +1196,10 @@ var CodexProvider = class {
|
|
|
1157
1196
|
}
|
|
1158
1197
|
await this.ensureEnvironmentReady();
|
|
1159
1198
|
const inputFiles = normalizeInputFiles2(request.inputFiles);
|
|
1160
|
-
const originalGuidelines = new Set(
|
|
1161
|
-
collectGuidelineFiles(inputFiles, request.guideline_patterns).map((file) => path4.resolve(file))
|
|
1162
|
-
);
|
|
1163
1199
|
const workspaceRoot = await this.createWorkspace();
|
|
1164
1200
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
1165
1201
|
try {
|
|
1166
|
-
const
|
|
1167
|
-
inputFiles,
|
|
1168
|
-
workspaceRoot,
|
|
1169
|
-
originalGuidelines
|
|
1170
|
-
);
|
|
1171
|
-
const promptContent = buildPromptDocument(request, mirroredInputFiles, {
|
|
1172
|
-
guidelinePatterns: request.guideline_patterns,
|
|
1173
|
-
guidelineOverrides: guidelineMirrors
|
|
1174
|
-
});
|
|
1202
|
+
const promptContent = buildPromptDocument(request, inputFiles);
|
|
1175
1203
|
const promptFile = path4.join(workspaceRoot, PROMPT_FILENAME);
|
|
1176
1204
|
await writeFile(promptFile, promptContent, "utf8");
|
|
1177
1205
|
const args = this.buildCodexArgs();
|
|
@@ -1200,7 +1228,7 @@ var CodexProvider = class {
|
|
|
1200
1228
|
executable: this.resolvedExecutable ?? this.config.executable,
|
|
1201
1229
|
promptFile,
|
|
1202
1230
|
workspace: workspaceRoot,
|
|
1203
|
-
inputFiles
|
|
1231
|
+
inputFiles,
|
|
1204
1232
|
logFile: logger?.filePath
|
|
1205
1233
|
}
|
|
1206
1234
|
};
|
|
@@ -1255,37 +1283,6 @@ var CodexProvider = class {
|
|
|
1255
1283
|
throw error;
|
|
1256
1284
|
}
|
|
1257
1285
|
}
|
|
1258
|
-
async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
|
|
1259
|
-
if (!inputFiles || inputFiles.length === 0) {
|
|
1260
|
-
return {
|
|
1261
|
-
mirroredInputFiles: void 0,
|
|
1262
|
-
guidelineMirrors: /* @__PURE__ */ new Set()
|
|
1263
|
-
};
|
|
1264
|
-
}
|
|
1265
|
-
const filesRoot = path4.join(workspaceRoot, FILES_DIR);
|
|
1266
|
-
await mkdir(filesRoot, { recursive: true });
|
|
1267
|
-
const mirrored = [];
|
|
1268
|
-
const guidelineMirrors = /* @__PURE__ */ new Set();
|
|
1269
|
-
const nameCounts = /* @__PURE__ */ new Map();
|
|
1270
|
-
for (const inputFile of inputFiles) {
|
|
1271
|
-
const absoluteSource = path4.resolve(inputFile);
|
|
1272
|
-
const baseName = path4.basename(absoluteSource);
|
|
1273
|
-
const count = nameCounts.get(baseName) ?? 0;
|
|
1274
|
-
nameCounts.set(baseName, count + 1);
|
|
1275
|
-
const finalName = count === 0 ? baseName : `${baseName}.${count}`;
|
|
1276
|
-
const destination = path4.join(filesRoot, finalName);
|
|
1277
|
-
await copyFile(absoluteSource, destination);
|
|
1278
|
-
const resolvedDestination = path4.resolve(destination);
|
|
1279
|
-
mirrored.push(resolvedDestination);
|
|
1280
|
-
if (guidelineOriginals.has(absoluteSource)) {
|
|
1281
|
-
guidelineMirrors.add(resolvedDestination);
|
|
1282
|
-
}
|
|
1283
|
-
}
|
|
1284
|
-
return {
|
|
1285
|
-
mirroredInputFiles: mirrored,
|
|
1286
|
-
guidelineMirrors
|
|
1287
|
-
};
|
|
1288
|
-
}
|
|
1289
1286
|
async createWorkspace() {
|
|
1290
1287
|
return await mkdtemp(path4.join(tmpdir(), WORKSPACE_PREFIX));
|
|
1291
1288
|
}
|
|
@@ -1863,7 +1860,7 @@ var MockProvider = class {
|
|
|
1863
1860
|
return {
|
|
1864
1861
|
text: this.cannedResponse,
|
|
1865
1862
|
raw: {
|
|
1866
|
-
|
|
1863
|
+
question: request.question,
|
|
1867
1864
|
guidelines: request.guidelines
|
|
1868
1865
|
}
|
|
1869
1866
|
};
|
|
@@ -2256,23 +2253,25 @@ function resolveOptionalString(source, env, description, options) {
|
|
|
2256
2253
|
if (trimmed.length === 0) {
|
|
2257
2254
|
return void 0;
|
|
2258
2255
|
}
|
|
2259
|
-
const
|
|
2260
|
-
if (
|
|
2261
|
-
|
|
2262
|
-
|
|
2256
|
+
const envVarMatch = trimmed.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
|
|
2257
|
+
if (envVarMatch) {
|
|
2258
|
+
const varName = envVarMatch[1];
|
|
2259
|
+
const envValue = env[varName];
|
|
2260
|
+
if (envValue !== void 0) {
|
|
2261
|
+
if (envValue.trim().length === 0) {
|
|
2262
|
+
throw new Error(`Environment variable '${varName}' for ${description} is empty`);
|
|
2263
|
+
}
|
|
2264
|
+
return envValue;
|
|
2263
2265
|
}
|
|
2264
|
-
|
|
2265
|
-
}
|
|
2266
|
-
const allowLiteral = options?.allowLiteral ?? false;
|
|
2267
|
-
const optionalEnv = options?.optionalEnv ?? false;
|
|
2268
|
-
const looksLikeEnv = isLikelyEnvReference(trimmed);
|
|
2269
|
-
if (looksLikeEnv) {
|
|
2266
|
+
const optionalEnv = options?.optionalEnv ?? false;
|
|
2270
2267
|
if (optionalEnv) {
|
|
2271
2268
|
return void 0;
|
|
2272
2269
|
}
|
|
2273
|
-
|
|
2274
|
-
|
|
2275
|
-
|
|
2270
|
+
throw new Error(`Environment variable '${varName}' required for ${description} is not set`);
|
|
2271
|
+
}
|
|
2272
|
+
const allowLiteral = options?.allowLiteral ?? false;
|
|
2273
|
+
if (!allowLiteral) {
|
|
2274
|
+
throw new Error(`${description} must use \${{ VARIABLE_NAME }} syntax for environment variables or be marked as allowing literals`);
|
|
2276
2275
|
}
|
|
2277
2276
|
return trimmed;
|
|
2278
2277
|
}
|
|
@@ -2319,9 +2318,6 @@ function resolveOptionalBoolean(source) {
|
|
|
2319
2318
|
}
|
|
2320
2319
|
throw new Error("expected boolean value");
|
|
2321
2320
|
}
|
|
2322
|
-
function isLikelyEnvReference(value) {
|
|
2323
|
-
return /^[A-Z0-9_]+$/.test(value);
|
|
2324
|
-
}
|
|
2325
2321
|
function resolveOptionalStringArray(source, env, description) {
|
|
2326
2322
|
if (source === void 0 || source === null) {
|
|
2327
2323
|
return void 0;
|
|
@@ -2342,21 +2338,25 @@ function resolveOptionalStringArray(source, env, description) {
|
|
|
2342
2338
|
if (trimmed.length === 0) {
|
|
2343
2339
|
throw new Error(`${description}[${i}] cannot be empty`);
|
|
2344
2340
|
}
|
|
2345
|
-
const
|
|
2346
|
-
if (
|
|
2347
|
-
|
|
2348
|
-
|
|
2341
|
+
const envVarMatch = trimmed.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
|
|
2342
|
+
if (envVarMatch) {
|
|
2343
|
+
const varName = envVarMatch[1];
|
|
2344
|
+
const envValue = env[varName];
|
|
2345
|
+
if (envValue !== void 0) {
|
|
2346
|
+
if (envValue.trim().length === 0) {
|
|
2347
|
+
throw new Error(`Environment variable '${varName}' for ${description}[${i}] is empty`);
|
|
2348
|
+
}
|
|
2349
|
+
resolved.push(envValue);
|
|
2350
|
+
continue;
|
|
2349
2351
|
}
|
|
2350
|
-
|
|
2351
|
-
} else {
|
|
2352
|
-
resolved.push(trimmed);
|
|
2352
|
+
throw new Error(`Environment variable '${varName}' for ${description}[${i}] is not set`);
|
|
2353
2353
|
}
|
|
2354
|
+
resolved.push(trimmed);
|
|
2354
2355
|
}
|
|
2355
2356
|
return resolved.length > 0 ? resolved : void 0;
|
|
2356
2357
|
}
|
|
2357
2358
|
|
|
2358
2359
|
// src/evaluation/providers/vscode.ts
|
|
2359
|
-
import { readFile as readFile2 } from "node:fs/promises";
|
|
2360
2360
|
import path5 from "node:path";
|
|
2361
2361
|
import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
|
|
2362
2362
|
var VSCodeProvider = class {
|
|
@@ -2400,7 +2400,7 @@ var VSCodeProvider = class {
|
|
|
2400
2400
|
}
|
|
2401
2401
|
};
|
|
2402
2402
|
}
|
|
2403
|
-
const responseText = await
|
|
2403
|
+
const responseText = await readTextFile(session.responseFile);
|
|
2404
2404
|
return {
|
|
2405
2405
|
text: responseText,
|
|
2406
2406
|
raw: {
|
|
@@ -2454,7 +2454,7 @@ var VSCodeProvider = class {
|
|
|
2454
2454
|
}
|
|
2455
2455
|
const responses = [];
|
|
2456
2456
|
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
2457
|
-
const responseText = await
|
|
2457
|
+
const responseText = await readTextFile(responseFile);
|
|
2458
2458
|
responses.push({
|
|
2459
2459
|
text: responseText,
|
|
2460
2460
|
raw: {
|
|
@@ -2479,7 +2479,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
|
2479
2479
|
if (prereadBlock.length > 0) {
|
|
2480
2480
|
parts.push("\n", prereadBlock);
|
|
2481
2481
|
}
|
|
2482
|
-
parts.push("\n[[ ## user_query ## ]]\n", request.
|
|
2482
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
|
|
2483
2483
|
return parts.join("\n").trim();
|
|
2484
2484
|
}
|
|
2485
2485
|
function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
@@ -2604,7 +2604,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
2604
2604
|
|
|
2605
2605
|
// src/evaluation/providers/targets-file.ts
|
|
2606
2606
|
import { constants as constants3 } from "node:fs";
|
|
2607
|
-
import { access as access3, readFile as
|
|
2607
|
+
import { access as access3, readFile as readFile2 } from "node:fs/promises";
|
|
2608
2608
|
import path6 from "node:path";
|
|
2609
2609
|
import { parse as parse2 } from "yaml";
|
|
2610
2610
|
function isRecord(value) {
|
|
@@ -2672,7 +2672,7 @@ async function readTargetDefinitions(filePath) {
|
|
|
2672
2672
|
if (!await fileExists3(absolutePath)) {
|
|
2673
2673
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
2674
2674
|
}
|
|
2675
|
-
const raw = await
|
|
2675
|
+
const raw = await readFile2(absolutePath, "utf8");
|
|
2676
2676
|
const parsed = parse2(raw);
|
|
2677
2677
|
if (!isRecord(parsed)) {
|
|
2678
2678
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
@@ -2716,30 +2716,7 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
2716
2716
|
}
|
|
2717
2717
|
|
|
2718
2718
|
// src/evaluation/evaluators.ts
|
|
2719
|
-
import { ax, f } from "@ax-llm/ax";
|
|
2720
2719
|
import { randomUUID as randomUUID2 } from "node:crypto";
|
|
2721
|
-
var LLM_JUDGE_SIGNATURE = f().input(
|
|
2722
|
-
"evaluationContext",
|
|
2723
|
-
f.object(
|
|
2724
|
-
{
|
|
2725
|
-
expectedOutcome: f.string("The expected outcome for the original task"),
|
|
2726
|
-
request: f.string("The original task request"),
|
|
2727
|
-
referenceAnswer: f.string("The gold standard reference answer"),
|
|
2728
|
-
generatedAnswer: f.string("The answer to evaluate"),
|
|
2729
|
-
guidelines: f.string("Additional evaluation guidelines or instructions").optional()
|
|
2730
|
-
},
|
|
2731
|
-
"Complete evaluation context for the judge"
|
|
2732
|
-
)
|
|
2733
|
-
).output(
|
|
2734
|
-
"evaluation",
|
|
2735
|
-
f.object({
|
|
2736
|
-
score: f.number("Score between 0.0 and 1.0").min(0).max(1),
|
|
2737
|
-
hits: f.string("Brief specific achievement").array(),
|
|
2738
|
-
misses: f.string("Brief specific failure or omission").array(),
|
|
2739
|
-
reasoning: f.string("Concise explanation for the score").max(500)
|
|
2740
|
-
})
|
|
2741
|
-
).build();
|
|
2742
|
-
var LLM_JUDGE = ax(LLM_JUDGE_SIGNATURE);
|
|
2743
2720
|
var LlmJudgeEvaluator = class {
|
|
2744
2721
|
kind = "llm_judge";
|
|
2745
2722
|
resolveJudgeProvider;
|
|
@@ -2757,52 +2734,29 @@ var LlmJudgeEvaluator = class {
|
|
|
2757
2734
|
if (!judgeProvider) {
|
|
2758
2735
|
throw new Error("No judge provider available for LLM grading");
|
|
2759
2736
|
}
|
|
2760
|
-
if (providerSupportsAx(judgeProvider)) {
|
|
2761
|
-
return this.evaluateWithAx(context, judgeProvider);
|
|
2762
|
-
}
|
|
2763
2737
|
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2764
2738
|
}
|
|
2765
|
-
async evaluateWithAx(context, judgeProvider) {
|
|
2766
|
-
const ai = judgeProvider.getAxAI();
|
|
2767
|
-
const guidelines = context.promptInputs.guidelines?.trim();
|
|
2768
|
-
const evaluationContext = {
|
|
2769
|
-
expectedOutcome: context.evalCase.outcome.trim(),
|
|
2770
|
-
request: context.evalCase.task.trim(),
|
|
2771
|
-
referenceAnswer: context.evalCase.expected_assistant_raw.trim(),
|
|
2772
|
-
generatedAnswer: context.candidate.trim(),
|
|
2773
|
-
...guidelines ? { guidelines } : {}
|
|
2774
|
-
};
|
|
2775
|
-
const options = this.buildJudgeForwardOptions(context);
|
|
2776
|
-
const result = await LLM_JUDGE.forward(ai, { evaluationContext }, options);
|
|
2777
|
-
const evaluation = result.evaluation;
|
|
2778
|
-
const expectedAspectCount = Math.max(
|
|
2779
|
-
evaluation.hits.length + evaluation.misses.length,
|
|
2780
|
-
1
|
|
2781
|
-
);
|
|
2782
|
-
return {
|
|
2783
|
-
score: evaluation.score,
|
|
2784
|
-
hits: evaluation.hits,
|
|
2785
|
-
misses: evaluation.misses,
|
|
2786
|
-
expectedAspectCount,
|
|
2787
|
-
reasoning: evaluation.reasoning,
|
|
2788
|
-
evaluatorRawRequest: {
|
|
2789
|
-
id: randomUUID2(),
|
|
2790
|
-
provider: judgeProvider.id,
|
|
2791
|
-
target: context.target.name,
|
|
2792
|
-
method: "ax-structured-output",
|
|
2793
|
-
signature: LLM_JUDGE_SIGNATURE.toString()
|
|
2794
|
-
}
|
|
2795
|
-
};
|
|
2796
|
-
}
|
|
2797
2739
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
2798
|
-
|
|
2799
|
-
|
|
2740
|
+
let prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
2741
|
+
let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
|
|
2742
|
+
if (systemPrompt && hasTemplateVariables(systemPrompt)) {
|
|
2743
|
+
const variables = {
|
|
2744
|
+
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
2745
|
+
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
2746
|
+
candidate_answer: context.candidate,
|
|
2747
|
+
reference_answer: context.evalCase.reference_answer,
|
|
2748
|
+
expected_outcome: context.evalCase.expected_outcome,
|
|
2749
|
+
question: context.evalCase.question
|
|
2750
|
+
};
|
|
2751
|
+
prompt = substituteVariables(systemPrompt, variables);
|
|
2752
|
+
systemPrompt = QUALITY_SYSTEM_PROMPT;
|
|
2753
|
+
}
|
|
2800
2754
|
const metadata = {
|
|
2801
2755
|
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2802
2756
|
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
2803
2757
|
};
|
|
2804
2758
|
const response = await judgeProvider.invoke({
|
|
2805
|
-
prompt,
|
|
2759
|
+
question: prompt,
|
|
2806
2760
|
metadata,
|
|
2807
2761
|
evalCaseId: context.evalCase.id,
|
|
2808
2762
|
attempt: context.attempt,
|
|
@@ -2832,33 +2786,11 @@ var LlmJudgeEvaluator = class {
|
|
|
2832
2786
|
evaluatorRawRequest
|
|
2833
2787
|
};
|
|
2834
2788
|
}
|
|
2835
|
-
buildJudgeForwardOptions(context) {
|
|
2836
|
-
const modelConfig = this.buildJudgeModelConfig();
|
|
2837
|
-
if (modelConfig === void 0 && context.judgeModel === void 0) {
|
|
2838
|
-
return void 0;
|
|
2839
|
-
}
|
|
2840
|
-
return {
|
|
2841
|
-
...context.judgeModel ? { model: context.judgeModel } : {},
|
|
2842
|
-
...modelConfig ? { modelConfig } : {}
|
|
2843
|
-
};
|
|
2844
|
-
}
|
|
2845
|
-
buildJudgeModelConfig() {
|
|
2846
|
-
if (this.maxOutputTokens === void 0 && this.temperature === void 0) {
|
|
2847
|
-
return void 0;
|
|
2848
|
-
}
|
|
2849
|
-
return {
|
|
2850
|
-
...this.maxOutputTokens !== void 0 ? { maxTokens: this.maxOutputTokens } : {},
|
|
2851
|
-
...this.temperature !== void 0 ? { temperature: this.temperature } : {}
|
|
2852
|
-
};
|
|
2853
|
-
}
|
|
2854
2789
|
};
|
|
2855
|
-
function providerSupportsAx(provider) {
|
|
2856
|
-
return typeof provider.getAxAI === "function";
|
|
2857
|
-
}
|
|
2858
2790
|
var QUALITY_SYSTEM_PROMPT = [
|
|
2859
|
-
"You are an expert evaluator. Your goal is to grade the
|
|
2791
|
+
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
2860
2792
|
"",
|
|
2861
|
-
"Use the reference_answer as a gold standard for a high-quality response. The
|
|
2793
|
+
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
|
|
2862
2794
|
"",
|
|
2863
2795
|
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
2864
2796
|
"",
|
|
@@ -2871,18 +2803,18 @@ var QUALITY_SYSTEM_PROMPT = [
|
|
|
2871
2803
|
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
2872
2804
|
"}"
|
|
2873
2805
|
].join("\n");
|
|
2874
|
-
function buildQualityPrompt(
|
|
2806
|
+
function buildQualityPrompt(evalCase, candidate) {
|
|
2875
2807
|
const parts = [
|
|
2876
2808
|
"[[ ## expected_outcome ## ]]",
|
|
2877
|
-
|
|
2809
|
+
evalCase.expected_outcome.trim(),
|
|
2878
2810
|
"",
|
|
2879
|
-
"[[ ##
|
|
2880
|
-
|
|
2811
|
+
"[[ ## question ## ]]",
|
|
2812
|
+
evalCase.question.trim(),
|
|
2881
2813
|
"",
|
|
2882
2814
|
"[[ ## reference_answer ## ]]",
|
|
2883
|
-
|
|
2815
|
+
evalCase.reference_answer.trim(),
|
|
2884
2816
|
"",
|
|
2885
|
-
"[[ ##
|
|
2817
|
+
"[[ ## candidate_answer ## ]]",
|
|
2886
2818
|
candidate.trim(),
|
|
2887
2819
|
"",
|
|
2888
2820
|
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
@@ -2982,14 +2914,14 @@ var CodeEvaluator = class {
|
|
|
2982
2914
|
async evaluate(context) {
|
|
2983
2915
|
const inputPayload = JSON.stringify(
|
|
2984
2916
|
{
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
|
|
2988
|
-
|
|
2917
|
+
question: context.evalCase.question,
|
|
2918
|
+
expected_outcome: context.evalCase.expected_outcome,
|
|
2919
|
+
reference_answer: context.evalCase.reference_answer,
|
|
2920
|
+
candidate_answer: context.candidate,
|
|
2989
2921
|
system_message: context.promptInputs.systemMessage ?? "",
|
|
2990
2922
|
guideline_paths: context.evalCase.guideline_paths,
|
|
2991
|
-
|
|
2992
|
-
|
|
2923
|
+
input_files: context.evalCase.file_paths,
|
|
2924
|
+
input_segments: context.evalCase.input_segments
|
|
2993
2925
|
},
|
|
2994
2926
|
null,
|
|
2995
2927
|
2
|
|
@@ -3075,6 +3007,14 @@ function parseJsonSafe(payload) {
|
|
|
3075
3007
|
return void 0;
|
|
3076
3008
|
}
|
|
3077
3009
|
}
|
|
3010
|
+
function hasTemplateVariables(text) {
|
|
3011
|
+
return /\$\{[a-zA-Z0-9_]+\}/.test(text);
|
|
3012
|
+
}
|
|
3013
|
+
function substituteVariables(template, variables) {
|
|
3014
|
+
return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
|
|
3015
|
+
return variables[varName] ?? match;
|
|
3016
|
+
});
|
|
3017
|
+
}
|
|
3078
3018
|
|
|
3079
3019
|
// src/evaluation/orchestrator.ts
|
|
3080
3020
|
import { createHash, randomUUID as randomUUID3 } from "node:crypto";
|
|
@@ -3397,7 +3337,8 @@ async function runEvaluation(options) {
|
|
|
3397
3337
|
target.name,
|
|
3398
3338
|
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
3399
3339
|
outcome.reason,
|
|
3400
|
-
promptInputs
|
|
3340
|
+
promptInputs,
|
|
3341
|
+
primaryProvider
|
|
3401
3342
|
);
|
|
3402
3343
|
results.push(errorResult);
|
|
3403
3344
|
if (onResult) {
|
|
@@ -3431,7 +3372,7 @@ async function runBatchEvaluation(options) {
|
|
|
3431
3372
|
const batchRequests = evalCases.map((evalCase, index) => {
|
|
3432
3373
|
const promptInputs = promptInputsList[index];
|
|
3433
3374
|
return {
|
|
3434
|
-
|
|
3375
|
+
question: promptInputs.question,
|
|
3435
3376
|
guidelines: promptInputs.guidelines,
|
|
3436
3377
|
guideline_patterns: evalCase.guideline_patterns,
|
|
3437
3378
|
inputFiles: evalCase.file_paths,
|
|
@@ -3481,7 +3422,7 @@ async function runBatchEvaluation(options) {
|
|
|
3481
3422
|
agentTimeoutMs
|
|
3482
3423
|
});
|
|
3483
3424
|
} catch (error) {
|
|
3484
|
-
const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
3425
|
+
const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
3485
3426
|
results.push(errorResult);
|
|
3486
3427
|
if (onResult) {
|
|
3487
3428
|
await onResult(errorResult);
|
|
@@ -3558,7 +3499,7 @@ async function runEvalCase(options) {
|
|
|
3558
3499
|
attempt += 1;
|
|
3559
3500
|
continue;
|
|
3560
3501
|
}
|
|
3561
|
-
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
3502
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
3562
3503
|
}
|
|
3563
3504
|
}
|
|
3564
3505
|
if (!providerResponse) {
|
|
@@ -3567,7 +3508,8 @@ async function runEvalCase(options) {
|
|
|
3567
3508
|
target.name,
|
|
3568
3509
|
nowFn(),
|
|
3569
3510
|
lastError ?? new Error("Provider did not return a response"),
|
|
3570
|
-
promptInputs
|
|
3511
|
+
promptInputs,
|
|
3512
|
+
provider
|
|
3571
3513
|
);
|
|
3572
3514
|
}
|
|
3573
3515
|
if (cacheKey && cache && !cachedResponse) {
|
|
@@ -3587,7 +3529,7 @@ async function runEvalCase(options) {
|
|
|
3587
3529
|
agentTimeoutMs
|
|
3588
3530
|
});
|
|
3589
3531
|
} catch (error) {
|
|
3590
|
-
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
3532
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
3591
3533
|
}
|
|
3592
3534
|
}
|
|
3593
3535
|
async function evaluateCandidate(options) {
|
|
@@ -3618,8 +3560,8 @@ async function evaluateCandidate(options) {
|
|
|
3618
3560
|
});
|
|
3619
3561
|
const completedAt = nowFn();
|
|
3620
3562
|
const rawRequest = {
|
|
3621
|
-
|
|
3622
|
-
guidelines: promptInputs.guidelines,
|
|
3563
|
+
question: promptInputs.question,
|
|
3564
|
+
...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
|
|
3623
3565
|
guideline_paths: evalCase.guideline_paths,
|
|
3624
3566
|
system_message: promptInputs.systemMessage ?? ""
|
|
3625
3567
|
};
|
|
@@ -3630,7 +3572,7 @@ async function evaluateCandidate(options) {
|
|
|
3630
3572
|
score: score.score,
|
|
3631
3573
|
hits: score.hits,
|
|
3632
3574
|
misses: score.misses,
|
|
3633
|
-
|
|
3575
|
+
candidate_answer: candidate,
|
|
3634
3576
|
expected_aspect_count: score.expectedAspectCount,
|
|
3635
3577
|
target: target.name,
|
|
3636
3578
|
timestamp: completedAt.toISOString(),
|
|
@@ -3840,7 +3782,7 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
|
3840
3782
|
await mkdir2(path7.dirname(filePath), { recursive: true });
|
|
3841
3783
|
const payload = {
|
|
3842
3784
|
eval_id: evalCase.id,
|
|
3843
|
-
|
|
3785
|
+
question: promptInputs.question,
|
|
3844
3786
|
guidelines: promptInputs.guidelines,
|
|
3845
3787
|
guideline_paths: evalCase.guideline_paths
|
|
3846
3788
|
};
|
|
@@ -3862,7 +3804,7 @@ async function invokeProvider(provider, options) {
|
|
|
3862
3804
|
}
|
|
3863
3805
|
try {
|
|
3864
3806
|
return await provider.invoke({
|
|
3865
|
-
|
|
3807
|
+
question: promptInputs.question,
|
|
3866
3808
|
guidelines: promptInputs.guidelines,
|
|
3867
3809
|
guideline_patterns: evalCase.guideline_patterns,
|
|
3868
3810
|
inputFiles: evalCase.file_paths,
|
|
@@ -3879,11 +3821,11 @@ async function invokeProvider(provider, options) {
|
|
|
3879
3821
|
}
|
|
3880
3822
|
}
|
|
3881
3823
|
}
|
|
3882
|
-
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
|
|
3824
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
|
|
3883
3825
|
const message = error instanceof Error ? error.message : String(error);
|
|
3884
3826
|
const rawRequest = {
|
|
3885
|
-
|
|
3886
|
-
guidelines: promptInputs.guidelines,
|
|
3827
|
+
question: promptInputs.question,
|
|
3828
|
+
...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
|
|
3887
3829
|
guideline_paths: evalCase.guideline_paths,
|
|
3888
3830
|
system_message: promptInputs.systemMessage ?? "",
|
|
3889
3831
|
error: message
|
|
@@ -3895,7 +3837,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
|
|
|
3895
3837
|
score: 0,
|
|
3896
3838
|
hits: [],
|
|
3897
3839
|
misses: [`Error: ${message}`],
|
|
3898
|
-
|
|
3840
|
+
candidate_answer: `Error occurred: ${message}`,
|
|
3899
3841
|
expected_aspect_count: 0,
|
|
3900
3842
|
target: targetName,
|
|
3901
3843
|
timestamp: timestamp.toISOString(),
|
|
@@ -3908,7 +3850,7 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
|
3908
3850
|
hash.update(provider.id);
|
|
3909
3851
|
hash.update(target.name);
|
|
3910
3852
|
hash.update(evalCase.id);
|
|
3911
|
-
hash.update(promptInputs.
|
|
3853
|
+
hash.update(promptInputs.question);
|
|
3912
3854
|
hash.update(promptInputs.guidelines);
|
|
3913
3855
|
hash.update(promptInputs.systemMessage ?? "");
|
|
3914
3856
|
return hash.digest("hex");
|