@agentv/core 0.11.0 → 0.14.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/dist/{chunk-YQBJAT5I.js → chunk-IOCVST3R.js} +1 -1
- package/dist/chunk-IOCVST3R.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +912 -747
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +46 -34
- package/dist/index.d.ts +46 -34
- package/dist/index.js +875 -708
- package/dist/index.js.map +1 -1
- package/package.json +5 -2
- package/dist/chunk-YQBJAT5I.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -33,15 +33,15 @@ __export(index_exports, {
|
|
|
33
33
|
CodeEvaluator: () => CodeEvaluator,
|
|
34
34
|
LlmJudgeEvaluator: () => LlmJudgeEvaluator,
|
|
35
35
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
36
|
-
buildDirectoryChain: () =>
|
|
36
|
+
buildDirectoryChain: () => buildDirectoryChain2,
|
|
37
37
|
buildPromptInputs: () => buildPromptInputs,
|
|
38
|
-
buildSearchRoots: () =>
|
|
38
|
+
buildSearchRoots: () => buildSearchRoots2,
|
|
39
39
|
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
40
40
|
createAgentKernel: () => createAgentKernel,
|
|
41
41
|
createProvider: () => createProvider,
|
|
42
42
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
43
43
|
extractCodeBlocks: () => extractCodeBlocks,
|
|
44
|
-
fileExists: () =>
|
|
44
|
+
fileExists: () => fileExists2,
|
|
45
45
|
findGitRoot: () => findGitRoot,
|
|
46
46
|
getHitCount: () => getHitCount,
|
|
47
47
|
isEvaluatorKind: () => isEvaluatorKind,
|
|
@@ -57,7 +57,7 @@ __export(index_exports, {
|
|
|
57
57
|
readTestSuiteMetadata: () => readTestSuiteMetadata,
|
|
58
58
|
readTextFile: () => readTextFile,
|
|
59
59
|
resolveAndCreateProvider: () => resolveAndCreateProvider,
|
|
60
|
-
resolveFileReference: () =>
|
|
60
|
+
resolveFileReference: () => resolveFileReference2,
|
|
61
61
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
62
62
|
runEvalCase: () => runEvalCase,
|
|
63
63
|
runEvaluation: () => runEvaluation,
|
|
@@ -116,47 +116,112 @@ function getHitCount(result) {
|
|
|
116
116
|
}
|
|
117
117
|
|
|
118
118
|
// src/evaluation/yaml-parser.ts
|
|
119
|
+
var import_promises5 = require("fs/promises");
|
|
120
|
+
var import_node_path6 = __toESM(require("path"), 1);
|
|
121
|
+
var import_yaml2 = require("yaml");
|
|
122
|
+
|
|
123
|
+
// src/evaluation/formatting/segment-formatter.ts
|
|
124
|
+
function extractCodeBlocks(segments) {
|
|
125
|
+
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
126
|
+
const codeBlocks = [];
|
|
127
|
+
for (const segment of segments) {
|
|
128
|
+
const typeValue = segment["type"];
|
|
129
|
+
if (typeof typeValue !== "string" || typeValue !== "text") {
|
|
130
|
+
continue;
|
|
131
|
+
}
|
|
132
|
+
const textValue = segment["value"];
|
|
133
|
+
if (typeof textValue !== "string") {
|
|
134
|
+
continue;
|
|
135
|
+
}
|
|
136
|
+
const matches = textValue.match(CODE_BLOCK_PATTERN);
|
|
137
|
+
if (matches) {
|
|
138
|
+
codeBlocks.push(...matches);
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
return codeBlocks;
|
|
142
|
+
}
|
|
143
|
+
function formatFileContents(parts) {
|
|
144
|
+
const fileCount = parts.filter((p) => p.isFile).length;
|
|
145
|
+
if (fileCount > 0) {
|
|
146
|
+
return parts.map((part) => {
|
|
147
|
+
if (part.isFile && part.displayPath) {
|
|
148
|
+
return `<file path="${part.displayPath}">
|
|
149
|
+
${part.content}
|
|
150
|
+
</file>`;
|
|
151
|
+
}
|
|
152
|
+
return part.content;
|
|
153
|
+
}).join("\n\n");
|
|
154
|
+
}
|
|
155
|
+
return parts.map((p) => p.content).join(" ");
|
|
156
|
+
}
|
|
157
|
+
function formatSegment(segment) {
|
|
158
|
+
const type = asString(segment.type);
|
|
159
|
+
if (type === "text") {
|
|
160
|
+
return asString(segment.value);
|
|
161
|
+
}
|
|
162
|
+
if (type === "guideline_ref") {
|
|
163
|
+
const refPath = asString(segment.path);
|
|
164
|
+
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
165
|
+
}
|
|
166
|
+
if (type === "file") {
|
|
167
|
+
const text = asString(segment.text);
|
|
168
|
+
const filePath = asString(segment.path);
|
|
169
|
+
if (text && filePath) {
|
|
170
|
+
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
return void 0;
|
|
174
|
+
}
|
|
175
|
+
function hasVisibleContent(segments) {
|
|
176
|
+
return segments.some((segment) => {
|
|
177
|
+
const type = asString(segment.type);
|
|
178
|
+
if (type === "text") {
|
|
179
|
+
const value = asString(segment.value);
|
|
180
|
+
return value !== void 0 && value.trim().length > 0;
|
|
181
|
+
}
|
|
182
|
+
if (type === "guideline_ref") {
|
|
183
|
+
return false;
|
|
184
|
+
}
|
|
185
|
+
if (type === "file") {
|
|
186
|
+
const text = asString(segment.text);
|
|
187
|
+
return text !== void 0 && text.trim().length > 0;
|
|
188
|
+
}
|
|
189
|
+
return false;
|
|
190
|
+
});
|
|
191
|
+
}
|
|
192
|
+
function asString(value) {
|
|
193
|
+
return typeof value === "string" ? value : void 0;
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
// src/evaluation/loaders/config-loader.ts
|
|
119
197
|
var import_micromatch = __toESM(require("micromatch"), 1);
|
|
120
|
-
var import_node_fs2 = require("fs");
|
|
121
198
|
var import_promises2 = require("fs/promises");
|
|
122
199
|
var import_node_path2 = __toESM(require("path"), 1);
|
|
123
|
-
var import_node_url = require("url");
|
|
124
200
|
var import_yaml = require("yaml");
|
|
125
201
|
|
|
126
|
-
// src/evaluation/file-
|
|
202
|
+
// src/evaluation/loaders/file-resolver.ts
|
|
127
203
|
var import_node_fs = require("fs");
|
|
128
204
|
var import_promises = require("fs/promises");
|
|
129
205
|
var import_node_path = __toESM(require("path"), 1);
|
|
130
|
-
async function fileExists(
|
|
206
|
+
async function fileExists(absolutePath) {
|
|
131
207
|
try {
|
|
132
|
-
await (0, import_promises.access)(
|
|
208
|
+
await (0, import_promises.access)(absolutePath, import_node_fs.constants.F_OK);
|
|
133
209
|
return true;
|
|
134
210
|
} catch {
|
|
135
211
|
return false;
|
|
136
212
|
}
|
|
137
213
|
}
|
|
138
|
-
function
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
async function findGitRoot(startPath) {
|
|
146
|
-
let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
|
|
147
|
-
const root = import_node_path.default.parse(currentDir).root;
|
|
148
|
-
while (currentDir !== root) {
|
|
149
|
-
const gitPath = import_node_path.default.join(currentDir, ".git");
|
|
150
|
-
if (await fileExists(gitPath)) {
|
|
151
|
-
return currentDir;
|
|
152
|
-
}
|
|
153
|
-
const parentDir = import_node_path.default.dirname(currentDir);
|
|
154
|
-
if (parentDir === currentDir) {
|
|
155
|
-
break;
|
|
214
|
+
function resolveToAbsolutePath(candidate) {
|
|
215
|
+
if (candidate instanceof URL) {
|
|
216
|
+
return new URL(candidate).pathname;
|
|
217
|
+
}
|
|
218
|
+
if (typeof candidate === "string") {
|
|
219
|
+
if (candidate.startsWith("file://")) {
|
|
220
|
+
return new URL(candidate).pathname;
|
|
156
221
|
}
|
|
157
|
-
|
|
222
|
+
return import_node_path.default.resolve(candidate);
|
|
158
223
|
}
|
|
159
|
-
|
|
224
|
+
throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
|
|
160
225
|
}
|
|
161
226
|
function buildDirectoryChain(filePath, repoRoot) {
|
|
162
227
|
const directories = [];
|
|
@@ -234,44 +299,15 @@ async function resolveFileReference(rawValue, searchRoots) {
|
|
|
234
299
|
return { displayPath, attempted };
|
|
235
300
|
}
|
|
236
301
|
|
|
237
|
-
// src/evaluation/
|
|
238
|
-
var
|
|
302
|
+
// src/evaluation/loaders/config-loader.ts
|
|
303
|
+
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
239
304
|
var ANSI_YELLOW = "\x1B[33m";
|
|
240
305
|
var ANSI_RESET = "\x1B[0m";
|
|
241
|
-
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
242
|
-
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
243
|
-
async function readTestSuiteMetadata(testFilePath) {
|
|
244
|
-
try {
|
|
245
|
-
const absolutePath = import_node_path2.default.resolve(testFilePath);
|
|
246
|
-
const content = await (0, import_promises2.readFile)(absolutePath, "utf8");
|
|
247
|
-
const parsed = (0, import_yaml.parse)(content);
|
|
248
|
-
if (!isJsonObject(parsed)) {
|
|
249
|
-
return {};
|
|
250
|
-
}
|
|
251
|
-
return { target: extractTargetFromSuite(parsed) };
|
|
252
|
-
} catch {
|
|
253
|
-
return {};
|
|
254
|
-
}
|
|
255
|
-
}
|
|
256
|
-
function extractTargetFromSuite(suite) {
|
|
257
|
-
const execution = suite.execution;
|
|
258
|
-
if (execution && typeof execution === "object" && !Array.isArray(execution)) {
|
|
259
|
-
const executionTarget = execution.target;
|
|
260
|
-
if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
|
|
261
|
-
return executionTarget.trim();
|
|
262
|
-
}
|
|
263
|
-
}
|
|
264
|
-
const targetValue = suite.target;
|
|
265
|
-
if (typeof targetValue === "string" && targetValue.trim().length > 0) {
|
|
266
|
-
return targetValue.trim();
|
|
267
|
-
}
|
|
268
|
-
return void 0;
|
|
269
|
-
}
|
|
270
306
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
271
307
|
const directories = buildDirectoryChain(evalFilePath, repoRoot);
|
|
272
308
|
for (const directory of directories) {
|
|
273
309
|
const configPath = import_node_path2.default.join(directory, ".agentv", "config.yaml");
|
|
274
|
-
if (!await
|
|
310
|
+
if (!await fileExists(configPath)) {
|
|
275
311
|
continue;
|
|
276
312
|
}
|
|
277
313
|
try {
|
|
@@ -313,24 +349,134 @@ function isGuidelineFile(filePath, patterns) {
|
|
|
313
349
|
const patternsToUse = patterns ?? [];
|
|
314
350
|
return import_micromatch.default.isMatch(normalized, patternsToUse);
|
|
315
351
|
}
|
|
316
|
-
function
|
|
317
|
-
const
|
|
318
|
-
|
|
319
|
-
const
|
|
320
|
-
if (typeof
|
|
352
|
+
function extractTargetFromSuite(suite) {
|
|
353
|
+
const execution = suite.execution;
|
|
354
|
+
if (execution && typeof execution === "object" && !Array.isArray(execution)) {
|
|
355
|
+
const executionTarget = execution.target;
|
|
356
|
+
if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
|
|
357
|
+
return executionTarget.trim();
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
const targetValue = suite.target;
|
|
361
|
+
if (typeof targetValue === "string" && targetValue.trim().length > 0) {
|
|
362
|
+
return targetValue.trim();
|
|
363
|
+
}
|
|
364
|
+
return void 0;
|
|
365
|
+
}
|
|
366
|
+
function logWarning(message) {
|
|
367
|
+
console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
// src/evaluation/loaders/evaluator-parser.ts
|
|
371
|
+
var import_node_path3 = __toESM(require("path"), 1);
|
|
372
|
+
var ANSI_YELLOW2 = "\x1B[33m";
|
|
373
|
+
var ANSI_RESET2 = "\x1B[0m";
|
|
374
|
+
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
375
|
+
const execution = rawEvalCase.execution;
|
|
376
|
+
const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
377
|
+
if (candidateEvaluators === void 0) {
|
|
378
|
+
return void 0;
|
|
379
|
+
}
|
|
380
|
+
if (!Array.isArray(candidateEvaluators)) {
|
|
381
|
+
logWarning2(`Skipping evaluators for '${evalId}': expected array`);
|
|
382
|
+
return void 0;
|
|
383
|
+
}
|
|
384
|
+
const evaluators = [];
|
|
385
|
+
for (const rawEvaluator of candidateEvaluators) {
|
|
386
|
+
if (!isJsonObject2(rawEvaluator)) {
|
|
387
|
+
logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
|
|
321
388
|
continue;
|
|
322
389
|
}
|
|
323
|
-
const
|
|
324
|
-
|
|
390
|
+
const name = asString2(rawEvaluator.name);
|
|
391
|
+
const typeValue = rawEvaluator.type;
|
|
392
|
+
if (!name || !isEvaluatorKind(typeValue)) {
|
|
393
|
+
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
325
394
|
continue;
|
|
326
395
|
}
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
396
|
+
if (typeValue === "code") {
|
|
397
|
+
const script = asString2(rawEvaluator.script);
|
|
398
|
+
if (!script) {
|
|
399
|
+
logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
|
|
400
|
+
continue;
|
|
401
|
+
}
|
|
402
|
+
const cwd = asString2(rawEvaluator.cwd);
|
|
403
|
+
let resolvedCwd;
|
|
404
|
+
if (cwd) {
|
|
405
|
+
const resolved = await resolveFileReference(cwd, searchRoots);
|
|
406
|
+
if (resolved.resolvedPath) {
|
|
407
|
+
resolvedCwd = import_node_path3.default.resolve(resolved.resolvedPath);
|
|
408
|
+
} else {
|
|
409
|
+
logWarning2(
|
|
410
|
+
`Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
411
|
+
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
412
|
+
);
|
|
413
|
+
}
|
|
414
|
+
} else {
|
|
415
|
+
resolvedCwd = searchRoots[0];
|
|
416
|
+
}
|
|
417
|
+
evaluators.push({
|
|
418
|
+
name,
|
|
419
|
+
type: "code",
|
|
420
|
+
script,
|
|
421
|
+
cwd,
|
|
422
|
+
resolvedCwd
|
|
423
|
+
});
|
|
424
|
+
continue;
|
|
425
|
+
}
|
|
426
|
+
const prompt = asString2(rawEvaluator.prompt);
|
|
427
|
+
let promptPath;
|
|
428
|
+
if (prompt) {
|
|
429
|
+
const resolved = await resolveFileReference(prompt, searchRoots);
|
|
430
|
+
if (resolved.resolvedPath) {
|
|
431
|
+
promptPath = import_node_path3.default.resolve(resolved.resolvedPath);
|
|
432
|
+
} else {
|
|
433
|
+
logWarning2(
|
|
434
|
+
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
435
|
+
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
436
|
+
);
|
|
437
|
+
}
|
|
330
438
|
}
|
|
439
|
+
const _model = asString2(rawEvaluator.model);
|
|
440
|
+
evaluators.push({
|
|
441
|
+
name,
|
|
442
|
+
type: "llm_judge",
|
|
443
|
+
prompt,
|
|
444
|
+
promptPath
|
|
445
|
+
});
|
|
446
|
+
}
|
|
447
|
+
return evaluators.length > 0 ? evaluators : void 0;
|
|
448
|
+
}
|
|
449
|
+
function coerceEvaluator(candidate, contextId) {
|
|
450
|
+
if (typeof candidate !== "string") {
|
|
451
|
+
return void 0;
|
|
452
|
+
}
|
|
453
|
+
if (isEvaluatorKind(candidate)) {
|
|
454
|
+
return candidate;
|
|
455
|
+
}
|
|
456
|
+
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
457
|
+
return void 0;
|
|
458
|
+
}
|
|
459
|
+
function asString2(value) {
|
|
460
|
+
return typeof value === "string" ? value : void 0;
|
|
461
|
+
}
|
|
462
|
+
function isJsonObject2(value) {
|
|
463
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
464
|
+
}
|
|
465
|
+
function logWarning2(message, details) {
|
|
466
|
+
if (details && details.length > 0) {
|
|
467
|
+
const detailBlock = details.join("\n");
|
|
468
|
+
console.warn(`${ANSI_YELLOW2}Warning: ${message}
|
|
469
|
+
${detailBlock}${ANSI_RESET2}`);
|
|
470
|
+
} else {
|
|
471
|
+
console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
|
|
331
472
|
}
|
|
332
|
-
return codeBlocks;
|
|
333
473
|
}
|
|
474
|
+
|
|
475
|
+
// src/evaluation/loaders/message-processor.ts
|
|
476
|
+
var import_promises3 = require("fs/promises");
|
|
477
|
+
var import_node_path4 = __toESM(require("path"), 1);
|
|
478
|
+
var ANSI_YELLOW3 = "\x1B[33m";
|
|
479
|
+
var ANSI_RESET3 = "\x1B[0m";
|
|
334
480
|
async function processMessages(options) {
|
|
335
481
|
const {
|
|
336
482
|
messages,
|
|
@@ -356,9 +502,9 @@ async function processMessages(options) {
|
|
|
356
502
|
if (!isJsonObject(rawSegment)) {
|
|
357
503
|
continue;
|
|
358
504
|
}
|
|
359
|
-
const segmentType =
|
|
505
|
+
const segmentType = asString3(rawSegment.type);
|
|
360
506
|
if (segmentType === "file") {
|
|
361
|
-
const rawValue =
|
|
507
|
+
const rawValue = asString3(rawSegment.value);
|
|
362
508
|
if (!rawValue) {
|
|
363
509
|
continue;
|
|
364
510
|
}
|
|
@@ -369,15 +515,15 @@ async function processMessages(options) {
|
|
|
369
515
|
if (!resolvedPath) {
|
|
370
516
|
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
371
517
|
const context = messageType === "input" ? "" : " in expected_messages";
|
|
372
|
-
|
|
518
|
+
logWarning3(`File not found${context}: ${displayPath}`, attempts);
|
|
373
519
|
continue;
|
|
374
520
|
}
|
|
375
521
|
try {
|
|
376
|
-
const fileContent = (await (0,
|
|
522
|
+
const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
377
523
|
if (messageType === "input" && guidelinePatterns && guidelinePaths) {
|
|
378
|
-
const relativeToRepo =
|
|
524
|
+
const relativeToRepo = import_node_path4.default.relative(repoRootPath, resolvedPath);
|
|
379
525
|
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
380
|
-
guidelinePaths.push(
|
|
526
|
+
guidelinePaths.push(import_node_path4.default.resolve(resolvedPath));
|
|
381
527
|
if (verbose) {
|
|
382
528
|
console.log(` [Guideline] Found: ${displayPath}`);
|
|
383
529
|
console.log(` Resolved to: ${resolvedPath}`);
|
|
@@ -389,7 +535,7 @@ async function processMessages(options) {
|
|
|
389
535
|
type: "file",
|
|
390
536
|
path: displayPath,
|
|
391
537
|
text: fileContent,
|
|
392
|
-
resolvedPath:
|
|
538
|
+
resolvedPath: import_node_path4.default.resolve(resolvedPath)
|
|
393
539
|
});
|
|
394
540
|
if (verbose) {
|
|
395
541
|
const label = messageType === "input" ? "[File]" : "[Expected Output File]";
|
|
@@ -398,7 +544,7 @@ async function processMessages(options) {
|
|
|
398
544
|
}
|
|
399
545
|
} catch (error) {
|
|
400
546
|
const context = messageType === "input" ? "" : " expected output";
|
|
401
|
-
|
|
547
|
+
logWarning3(`Could not read${context} file ${resolvedPath}: ${error.message}`);
|
|
402
548
|
}
|
|
403
549
|
continue;
|
|
404
550
|
}
|
|
@@ -412,201 +558,117 @@ async function processMessages(options) {
|
|
|
412
558
|
}
|
|
413
559
|
return segments;
|
|
414
560
|
}
|
|
415
|
-
async function
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
|
|
419
|
-
if (!await fileExists2(absoluteTestPath)) {
|
|
420
|
-
throw new Error(`Test file not found: ${evalFilePath}`);
|
|
421
|
-
}
|
|
422
|
-
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
423
|
-
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
424
|
-
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
425
|
-
const guidelinePatterns = config?.guideline_patterns;
|
|
426
|
-
const rawFile = await (0, import_promises2.readFile)(absoluteTestPath, "utf8");
|
|
427
|
-
const parsed = (0, import_yaml.parse)(rawFile);
|
|
428
|
-
if (!isJsonObject(parsed)) {
|
|
429
|
-
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
430
|
-
}
|
|
431
|
-
const suite = parsed;
|
|
432
|
-
const datasetNameFromSuite = asString(suite.dataset)?.trim();
|
|
433
|
-
const fallbackDataset = import_node_path2.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
434
|
-
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
435
|
-
const schema = suite.$schema;
|
|
436
|
-
if (schema !== SCHEMA_EVAL_V2) {
|
|
437
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
438
|
-
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
439
|
-
throw new Error(message);
|
|
561
|
+
async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
562
|
+
if (typeof content === "string") {
|
|
563
|
+
return content;
|
|
440
564
|
}
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
565
|
+
if (!content) {
|
|
566
|
+
return "";
|
|
444
567
|
}
|
|
445
|
-
const
|
|
446
|
-
const
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
for (const rawEvalcase of rawTestcases) {
|
|
450
|
-
if (!isJsonObject(rawEvalcase)) {
|
|
451
|
-
logWarning("Skipping invalid eval case entry (expected object)");
|
|
568
|
+
const parts = [];
|
|
569
|
+
for (const entry of content) {
|
|
570
|
+
if (typeof entry === "string") {
|
|
571
|
+
parts.push({ content: entry, isFile: false });
|
|
452
572
|
continue;
|
|
453
573
|
}
|
|
454
|
-
|
|
455
|
-
const id = asString(evalcase.id);
|
|
456
|
-
if (evalIdFilter && id !== evalIdFilter) {
|
|
574
|
+
if (!isJsonObject(entry)) {
|
|
457
575
|
continue;
|
|
458
576
|
}
|
|
459
|
-
const
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
577
|
+
const segmentType = asString3(entry.type);
|
|
578
|
+
if (segmentType === "file") {
|
|
579
|
+
const rawValue = asString3(entry.value);
|
|
580
|
+
if (!rawValue) {
|
|
581
|
+
continue;
|
|
582
|
+
}
|
|
583
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
584
|
+
rawValue,
|
|
585
|
+
searchRoots
|
|
586
|
+
);
|
|
587
|
+
if (!resolvedPath) {
|
|
588
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
589
|
+
logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
590
|
+
continue;
|
|
591
|
+
}
|
|
592
|
+
try {
|
|
593
|
+
const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
594
|
+
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
595
|
+
if (verbose) {
|
|
596
|
+
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
597
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
598
|
+
}
|
|
599
|
+
} catch (error) {
|
|
600
|
+
logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
601
|
+
}
|
|
465
602
|
continue;
|
|
466
603
|
}
|
|
467
|
-
const
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
471
|
-
logWarning(`No valid expected message found for eval case: ${id}`);
|
|
604
|
+
const textValue = asString3(entry.text);
|
|
605
|
+
if (typeof textValue === "string") {
|
|
606
|
+
parts.push({ content: textValue, isFile: false });
|
|
472
607
|
continue;
|
|
473
608
|
}
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
const inputTextParts = [];
|
|
479
|
-
const inputSegments = await processMessages({
|
|
480
|
-
messages: inputMessages,
|
|
481
|
-
searchRoots,
|
|
482
|
-
repoRootPath,
|
|
483
|
-
guidelinePatterns,
|
|
484
|
-
guidelinePaths,
|
|
485
|
-
textParts: inputTextParts,
|
|
486
|
-
messageType: "input",
|
|
487
|
-
verbose
|
|
488
|
-
});
|
|
489
|
-
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
490
|
-
messages: expectedMessages,
|
|
491
|
-
searchRoots,
|
|
492
|
-
repoRootPath,
|
|
493
|
-
guidelinePatterns,
|
|
494
|
-
messageType: "output",
|
|
495
|
-
verbose
|
|
496
|
-
}) : [];
|
|
497
|
-
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
498
|
-
const expectedContent = expectedMessages[0]?.content;
|
|
499
|
-
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
500
|
-
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
501
|
-
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
502
|
-
const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
503
|
-
const userFilePaths = [];
|
|
504
|
-
for (const segment of inputSegments) {
|
|
505
|
-
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
506
|
-
userFilePaths.push(segment.resolvedPath);
|
|
507
|
-
}
|
|
508
|
-
}
|
|
509
|
-
const allFilePaths = [
|
|
510
|
-
...guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
|
|
511
|
-
...userFilePaths
|
|
512
|
-
];
|
|
513
|
-
const testCase = {
|
|
514
|
-
id,
|
|
515
|
-
dataset: datasetName,
|
|
516
|
-
conversation_id: conversationId,
|
|
517
|
-
question,
|
|
518
|
-
input_messages: inputMessages,
|
|
519
|
-
input_segments: inputSegments,
|
|
520
|
-
output_segments: outputSegments,
|
|
521
|
-
reference_answer: referenceAnswer,
|
|
522
|
-
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
|
|
523
|
-
guideline_patterns: guidelinePatterns,
|
|
524
|
-
file_paths: allFilePaths,
|
|
525
|
-
code_snippets: codeSnippets,
|
|
526
|
-
expected_outcome: outcome,
|
|
527
|
-
evaluator: evalCaseEvaluatorKind,
|
|
528
|
-
evaluators
|
|
529
|
-
};
|
|
530
|
-
if (verbose) {
|
|
531
|
-
console.log(`
|
|
532
|
-
[Eval Case: ${id}]`);
|
|
533
|
-
if (testCase.guideline_paths.length > 0) {
|
|
534
|
-
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
535
|
-
for (const guidelinePath of testCase.guideline_paths) {
|
|
536
|
-
console.log(` - ${guidelinePath}`);
|
|
537
|
-
}
|
|
538
|
-
} else {
|
|
539
|
-
console.log(" No guidelines found");
|
|
540
|
-
}
|
|
609
|
+
const valueValue = asString3(entry.value);
|
|
610
|
+
if (typeof valueValue === "string") {
|
|
611
|
+
parts.push({ content: valueValue, isFile: false });
|
|
612
|
+
continue;
|
|
541
613
|
}
|
|
542
|
-
|
|
614
|
+
parts.push({ content: JSON.stringify(entry), isFile: false });
|
|
543
615
|
}
|
|
544
|
-
return
|
|
616
|
+
return formatFileContents(parts);
|
|
545
617
|
}
|
|
546
|
-
function
|
|
547
|
-
|
|
548
|
-
return true;
|
|
549
|
-
}
|
|
550
|
-
let messagesWithContent = 0;
|
|
551
|
-
for (const segments of processedSegmentsByMessage) {
|
|
552
|
-
if (hasVisibleContent(segments)) {
|
|
553
|
-
messagesWithContent++;
|
|
554
|
-
}
|
|
555
|
-
}
|
|
556
|
-
return messagesWithContent > 1;
|
|
618
|
+
function asString3(value) {
|
|
619
|
+
return typeof value === "string" ? value : void 0;
|
|
557
620
|
}
|
|
558
|
-
function
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
if (type === "text") {
|
|
562
|
-
const value = asString(segment.value);
|
|
563
|
-
return value !== void 0 && value.trim().length > 0;
|
|
564
|
-
}
|
|
565
|
-
if (type === "guideline_ref") {
|
|
566
|
-
return false;
|
|
567
|
-
}
|
|
568
|
-
if (type === "file") {
|
|
569
|
-
const text = asString(segment.text);
|
|
570
|
-
return text !== void 0 && text.trim().length > 0;
|
|
571
|
-
}
|
|
572
|
-
return false;
|
|
573
|
-
});
|
|
621
|
+
function cloneJsonObject(source) {
|
|
622
|
+
const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
|
|
623
|
+
return Object.fromEntries(entries);
|
|
574
624
|
}
|
|
575
|
-
function
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
return asString(segment.value);
|
|
625
|
+
function cloneJsonValue(value) {
|
|
626
|
+
if (value === null) {
|
|
627
|
+
return null;
|
|
579
628
|
}
|
|
580
|
-
if (
|
|
581
|
-
|
|
582
|
-
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
629
|
+
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
|
|
630
|
+
return value;
|
|
583
631
|
}
|
|
584
|
-
if (
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
632
|
+
if (Array.isArray(value)) {
|
|
633
|
+
return value.map((item) => cloneJsonValue(item));
|
|
634
|
+
}
|
|
635
|
+
if (typeof value === "object") {
|
|
636
|
+
return cloneJsonObject(value);
|
|
637
|
+
}
|
|
638
|
+
return value;
|
|
639
|
+
}
|
|
640
|
+
function logWarning3(message, details) {
|
|
641
|
+
if (details && details.length > 0) {
|
|
642
|
+
const detailBlock = details.join("\n");
|
|
643
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}
|
|
644
|
+
${detailBlock}${ANSI_RESET3}`);
|
|
645
|
+
} else {
|
|
646
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
|
|
590
647
|
}
|
|
591
|
-
return void 0;
|
|
592
648
|
}
|
|
649
|
+
|
|
650
|
+
// src/evaluation/formatting/prompt-builder.ts
|
|
651
|
+
var import_promises4 = require("fs/promises");
|
|
652
|
+
var import_node_path5 = __toESM(require("path"), 1);
|
|
653
|
+
var ANSI_YELLOW4 = "\x1B[33m";
|
|
654
|
+
var ANSI_RESET4 = "\x1B[0m";
|
|
593
655
|
async function buildPromptInputs(testCase) {
|
|
594
656
|
const guidelineParts = [];
|
|
595
657
|
for (const rawPath of testCase.guideline_paths) {
|
|
596
|
-
const absolutePath =
|
|
597
|
-
if (!await
|
|
598
|
-
|
|
658
|
+
const absolutePath = import_node_path5.default.resolve(rawPath);
|
|
659
|
+
if (!await fileExists(absolutePath)) {
|
|
660
|
+
logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
|
|
599
661
|
continue;
|
|
600
662
|
}
|
|
601
663
|
try {
|
|
602
|
-
const content = (await (0,
|
|
664
|
+
const content = (await (0, import_promises4.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
603
665
|
guidelineParts.push({
|
|
604
666
|
content,
|
|
605
667
|
isFile: true,
|
|
606
|
-
displayPath:
|
|
668
|
+
displayPath: import_node_path5.default.basename(absolutePath)
|
|
607
669
|
});
|
|
608
670
|
} catch (error) {
|
|
609
|
-
|
|
671
|
+
logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
610
672
|
}
|
|
611
673
|
}
|
|
612
674
|
const guidelines = formatFileContents(guidelineParts);
|
|
@@ -630,9 +692,9 @@ async function buildPromptInputs(testCase) {
|
|
|
630
692
|
messageSegments.push({ type: "text", value: segment });
|
|
631
693
|
}
|
|
632
694
|
} else if (isJsonObject(segment)) {
|
|
633
|
-
const type =
|
|
695
|
+
const type = asString4(segment.type);
|
|
634
696
|
if (type === "file") {
|
|
635
|
-
const value =
|
|
697
|
+
const value = asString4(segment.value);
|
|
636
698
|
if (!value) continue;
|
|
637
699
|
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
638
700
|
messageSegments.push({ type: "guideline_ref", path: value });
|
|
@@ -643,7 +705,7 @@ async function buildPromptInputs(testCase) {
|
|
|
643
705
|
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
644
706
|
}
|
|
645
707
|
} else if (type === "text") {
|
|
646
|
-
const textValue =
|
|
708
|
+
const textValue = asString4(segment.value);
|
|
647
709
|
if (textValue && textValue.trim().length > 0) {
|
|
648
710
|
messageSegments.push({ type: "text", value: textValue });
|
|
649
711
|
}
|
|
@@ -699,6 +761,18 @@ ${messageContent}`);
|
|
|
699
761
|
}) : void 0;
|
|
700
762
|
return { question, guidelines, chatPrompt };
|
|
701
763
|
}
|
|
764
|
+
function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
765
|
+
if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
|
|
766
|
+
return true;
|
|
767
|
+
}
|
|
768
|
+
let messagesWithContent = 0;
|
|
769
|
+
for (const segments of processedSegmentsByMessage) {
|
|
770
|
+
if (hasVisibleContent(segments)) {
|
|
771
|
+
messagesWithContent++;
|
|
772
|
+
}
|
|
773
|
+
}
|
|
774
|
+
return messagesWithContent > 1;
|
|
775
|
+
}
|
|
702
776
|
function buildChatPromptFromSegments(options) {
|
|
703
777
|
const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
|
|
704
778
|
if (messages.length === 0) {
|
|
@@ -740,13 +814,12 @@ ${guidelineContent.trim()}`);
|
|
|
740
814
|
const segments = segmentsByMessage[i];
|
|
741
815
|
const contentParts = [];
|
|
742
816
|
let role = message.role;
|
|
743
|
-
let name;
|
|
744
817
|
if (role === "system") {
|
|
745
818
|
role = "assistant";
|
|
746
819
|
contentParts.push("@[System]:");
|
|
747
820
|
} else if (role === "tool") {
|
|
748
|
-
role = "
|
|
749
|
-
|
|
821
|
+
role = "assistant";
|
|
822
|
+
contentParts.push("@[Tool]:");
|
|
750
823
|
}
|
|
751
824
|
for (const segment of segments) {
|
|
752
825
|
if (segment.type === "guideline_ref") {
|
|
@@ -764,282 +837,509 @@ ${guidelineContent.trim()}`);
|
|
|
764
837
|
if (contentParts.length === 0) {
|
|
765
838
|
continue;
|
|
766
839
|
}
|
|
840
|
+
const content = contentParts.join("\n");
|
|
767
841
|
chatPrompt.push({
|
|
768
842
|
role,
|
|
769
|
-
content
|
|
770
|
-
...name ? { name } : {}
|
|
843
|
+
content
|
|
771
844
|
});
|
|
772
845
|
}
|
|
773
846
|
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
774
847
|
}
|
|
775
|
-
|
|
848
|
+
function asString4(value) {
|
|
849
|
+
return typeof value === "string" ? value : void 0;
|
|
850
|
+
}
|
|
851
|
+
function logWarning4(message) {
|
|
852
|
+
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
853
|
+
}
|
|
854
|
+
|
|
855
|
+
// src/evaluation/yaml-parser.ts
|
|
856
|
+
var ANSI_YELLOW5 = "\x1B[33m";
|
|
857
|
+
var ANSI_RESET5 = "\x1B[0m";
|
|
858
|
+
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
859
|
+
async function readTestSuiteMetadata(testFilePath) {
|
|
776
860
|
try {
|
|
777
|
-
|
|
778
|
-
|
|
861
|
+
const absolutePath = import_node_path6.default.resolve(testFilePath);
|
|
862
|
+
const content = await (0, import_promises5.readFile)(absolutePath, "utf8");
|
|
863
|
+
const parsed = (0, import_yaml2.parse)(content);
|
|
864
|
+
if (!isJsonObject(parsed)) {
|
|
865
|
+
return {};
|
|
866
|
+
}
|
|
867
|
+
return { target: extractTargetFromSuite(parsed) };
|
|
779
868
|
} catch {
|
|
780
|
-
return
|
|
869
|
+
return {};
|
|
781
870
|
}
|
|
782
871
|
}
|
|
783
|
-
function
|
|
784
|
-
|
|
785
|
-
|
|
872
|
+
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
873
|
+
const verbose = options?.verbose ?? false;
|
|
874
|
+
const evalIdFilter = options?.evalId;
|
|
875
|
+
const absoluteTestPath = import_node_path6.default.resolve(evalFilePath);
|
|
876
|
+
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
877
|
+
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
878
|
+
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
879
|
+
const guidelinePatterns = config?.guideline_patterns;
|
|
880
|
+
const rawFile = await (0, import_promises5.readFile)(absoluteTestPath, "utf8");
|
|
881
|
+
const parsed = (0, import_yaml2.parse)(rawFile);
|
|
882
|
+
if (!isJsonObject(parsed)) {
|
|
883
|
+
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
786
884
|
}
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
885
|
+
const suite = parsed;
|
|
886
|
+
const datasetNameFromSuite = asString5(suite.dataset)?.trim();
|
|
887
|
+
const fallbackDataset = import_node_path6.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
888
|
+
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
889
|
+
const schema = suite.$schema;
|
|
890
|
+
if (schema !== SCHEMA_EVAL_V2) {
|
|
891
|
+
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
892
|
+
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
893
|
+
throw new Error(message);
|
|
894
|
+
}
|
|
895
|
+
const rawTestcases = suite.evalcases;
|
|
896
|
+
if (!Array.isArray(rawTestcases)) {
|
|
897
|
+
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
898
|
+
}
|
|
899
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
900
|
+
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
901
|
+
const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
|
|
902
|
+
const results = [];
|
|
903
|
+
for (const rawEvalcase of rawTestcases) {
|
|
904
|
+
if (!isJsonObject(rawEvalcase)) {
|
|
905
|
+
logWarning5("Skipping invalid eval case entry (expected object)");
|
|
906
|
+
continue;
|
|
790
907
|
}
|
|
791
|
-
|
|
908
|
+
const evalcase = rawEvalcase;
|
|
909
|
+
const id = asString5(evalcase.id);
|
|
910
|
+
if (evalIdFilter && id !== evalIdFilter) {
|
|
911
|
+
continue;
|
|
912
|
+
}
|
|
913
|
+
const conversationId = asString5(evalcase.conversation_id);
|
|
914
|
+
const outcome = asString5(evalcase.outcome);
|
|
915
|
+
const inputMessagesValue = evalcase.input_messages;
|
|
916
|
+
const expectedMessagesValue = evalcase.expected_messages;
|
|
917
|
+
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
918
|
+
logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
919
|
+
continue;
|
|
920
|
+
}
|
|
921
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
922
|
+
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
923
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
924
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
925
|
+
logWarning5(`No valid expected message found for eval case: ${id}`);
|
|
926
|
+
continue;
|
|
927
|
+
}
|
|
928
|
+
if (expectedMessages.length > 1) {
|
|
929
|
+
logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
930
|
+
}
|
|
931
|
+
const guidelinePaths = [];
|
|
932
|
+
const inputTextParts = [];
|
|
933
|
+
const inputSegments = await processMessages({
|
|
934
|
+
messages: inputMessages,
|
|
935
|
+
searchRoots,
|
|
936
|
+
repoRootPath,
|
|
937
|
+
guidelinePatterns,
|
|
938
|
+
guidelinePaths,
|
|
939
|
+
textParts: inputTextParts,
|
|
940
|
+
messageType: "input",
|
|
941
|
+
verbose
|
|
942
|
+
});
|
|
943
|
+
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
944
|
+
messages: expectedMessages,
|
|
945
|
+
searchRoots,
|
|
946
|
+
repoRootPath,
|
|
947
|
+
guidelinePatterns,
|
|
948
|
+
messageType: "output",
|
|
949
|
+
verbose
|
|
950
|
+
}) : [];
|
|
951
|
+
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
952
|
+
const expectedContent = expectedMessages[0]?.content;
|
|
953
|
+
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
954
|
+
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
955
|
+
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
956
|
+
const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
957
|
+
const userFilePaths = [];
|
|
958
|
+
for (const segment of inputSegments) {
|
|
959
|
+
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
960
|
+
userFilePaths.push(segment.resolvedPath);
|
|
961
|
+
}
|
|
962
|
+
}
|
|
963
|
+
const allFilePaths = [
|
|
964
|
+
...guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
|
|
965
|
+
...userFilePaths
|
|
966
|
+
];
|
|
967
|
+
const testCase = {
|
|
968
|
+
id,
|
|
969
|
+
dataset: datasetName,
|
|
970
|
+
conversation_id: conversationId,
|
|
971
|
+
question,
|
|
972
|
+
input_messages: inputMessages,
|
|
973
|
+
input_segments: inputSegments,
|
|
974
|
+
output_segments: outputSegments,
|
|
975
|
+
reference_answer: referenceAnswer,
|
|
976
|
+
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
|
|
977
|
+
guideline_patterns: guidelinePatterns,
|
|
978
|
+
file_paths: allFilePaths,
|
|
979
|
+
code_snippets: codeSnippets,
|
|
980
|
+
expected_outcome: outcome,
|
|
981
|
+
evaluator: evalCaseEvaluatorKind,
|
|
982
|
+
evaluators
|
|
983
|
+
};
|
|
984
|
+
if (verbose) {
|
|
985
|
+
console.log(`
|
|
986
|
+
[Eval Case: ${id}]`);
|
|
987
|
+
if (testCase.guideline_paths.length > 0) {
|
|
988
|
+
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
989
|
+
for (const guidelinePath of testCase.guideline_paths) {
|
|
990
|
+
console.log(` - ${guidelinePath}`);
|
|
991
|
+
}
|
|
992
|
+
} else {
|
|
993
|
+
console.log(" No guidelines found");
|
|
994
|
+
}
|
|
995
|
+
}
|
|
996
|
+
results.push(testCase);
|
|
792
997
|
}
|
|
793
|
-
|
|
998
|
+
return results;
|
|
794
999
|
}
|
|
795
|
-
function
|
|
1000
|
+
function asString5(value) {
|
|
796
1001
|
return typeof value === "string" ? value : void 0;
|
|
797
1002
|
}
|
|
798
|
-
function
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
}
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
}
|
|
806
|
-
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
|
|
807
|
-
return value;
|
|
1003
|
+
function logWarning5(message, details) {
|
|
1004
|
+
if (details && details.length > 0) {
|
|
1005
|
+
const detailBlock = details.join("\n");
|
|
1006
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}
|
|
1007
|
+
${detailBlock}${ANSI_RESET5}`);
|
|
1008
|
+
} else {
|
|
1009
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
|
|
808
1010
|
}
|
|
809
|
-
|
|
810
|
-
|
|
1011
|
+
}
|
|
1012
|
+
|
|
1013
|
+
// src/evaluation/file-utils.ts
|
|
1014
|
+
var import_node_fs2 = require("fs");
|
|
1015
|
+
var import_promises6 = require("fs/promises");
|
|
1016
|
+
var import_node_path7 = __toESM(require("path"), 1);
|
|
1017
|
+
async function fileExists2(filePath) {
|
|
1018
|
+
try {
|
|
1019
|
+
await (0, import_promises6.access)(filePath, import_node_fs2.constants.F_OK);
|
|
1020
|
+
return true;
|
|
1021
|
+
} catch {
|
|
1022
|
+
return false;
|
|
811
1023
|
}
|
|
812
|
-
return cloneJsonObject(value);
|
|
813
1024
|
}
|
|
814
|
-
function
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
1025
|
+
function normalizeLineEndings(content) {
|
|
1026
|
+
return content.replace(/\r\n/g, "\n");
|
|
1027
|
+
}
|
|
1028
|
+
async function readTextFile(filePath) {
|
|
1029
|
+
const content = await (0, import_promises6.readFile)(filePath, "utf8");
|
|
1030
|
+
return normalizeLineEndings(content);
|
|
1031
|
+
}
|
|
1032
|
+
async function findGitRoot(startPath) {
|
|
1033
|
+
let currentDir = import_node_path7.default.dirname(import_node_path7.default.resolve(startPath));
|
|
1034
|
+
const root = import_node_path7.default.parse(currentDir).root;
|
|
1035
|
+
while (currentDir !== root) {
|
|
1036
|
+
const gitPath = import_node_path7.default.join(currentDir, ".git");
|
|
1037
|
+
if (await fileExists2(gitPath)) {
|
|
1038
|
+
return currentDir;
|
|
1039
|
+
}
|
|
1040
|
+
const parentDir = import_node_path7.default.dirname(currentDir);
|
|
1041
|
+
if (parentDir === currentDir) {
|
|
1042
|
+
break;
|
|
1043
|
+
}
|
|
1044
|
+
currentDir = parentDir;
|
|
825
1045
|
}
|
|
826
|
-
return
|
|
1046
|
+
return null;
|
|
827
1047
|
}
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
1048
|
+
function buildDirectoryChain2(filePath, repoRoot) {
|
|
1049
|
+
const directories = [];
|
|
1050
|
+
const seen = /* @__PURE__ */ new Set();
|
|
1051
|
+
const boundary = import_node_path7.default.resolve(repoRoot);
|
|
1052
|
+
let current = import_node_path7.default.resolve(import_node_path7.default.dirname(filePath));
|
|
1053
|
+
while (current !== void 0) {
|
|
1054
|
+
if (!seen.has(current)) {
|
|
1055
|
+
directories.push(current);
|
|
1056
|
+
seen.add(current);
|
|
1057
|
+
}
|
|
1058
|
+
if (current === boundary) {
|
|
1059
|
+
break;
|
|
1060
|
+
}
|
|
1061
|
+
const parent = import_node_path7.default.dirname(current);
|
|
1062
|
+
if (parent === current) {
|
|
1063
|
+
break;
|
|
1064
|
+
}
|
|
1065
|
+
current = parent;
|
|
831
1066
|
}
|
|
832
|
-
if (!
|
|
833
|
-
|
|
1067
|
+
if (!seen.has(boundary)) {
|
|
1068
|
+
directories.push(boundary);
|
|
834
1069
|
}
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
if (!
|
|
842
|
-
|
|
1070
|
+
return directories;
|
|
1071
|
+
}
|
|
1072
|
+
function buildSearchRoots2(evalPath, repoRoot) {
|
|
1073
|
+
const uniqueRoots = [];
|
|
1074
|
+
const addRoot = (root) => {
|
|
1075
|
+
const normalized = import_node_path7.default.resolve(root);
|
|
1076
|
+
if (!uniqueRoots.includes(normalized)) {
|
|
1077
|
+
uniqueRoots.push(normalized);
|
|
843
1078
|
}
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
if (!resolvedPath) {
|
|
855
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
856
|
-
logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
857
|
-
continue;
|
|
858
|
-
}
|
|
859
|
-
try {
|
|
860
|
-
const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
861
|
-
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
862
|
-
if (verbose) {
|
|
863
|
-
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
864
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
865
|
-
}
|
|
866
|
-
} catch (error) {
|
|
867
|
-
logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
868
|
-
}
|
|
869
|
-
continue;
|
|
1079
|
+
};
|
|
1080
|
+
let currentDir = import_node_path7.default.dirname(evalPath);
|
|
1081
|
+
let reachedBoundary = false;
|
|
1082
|
+
while (!reachedBoundary) {
|
|
1083
|
+
addRoot(currentDir);
|
|
1084
|
+
const parentDir = import_node_path7.default.dirname(currentDir);
|
|
1085
|
+
if (currentDir === repoRoot || parentDir === currentDir) {
|
|
1086
|
+
reachedBoundary = true;
|
|
1087
|
+
} else {
|
|
1088
|
+
currentDir = parentDir;
|
|
870
1089
|
}
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
1090
|
+
}
|
|
1091
|
+
addRoot(repoRoot);
|
|
1092
|
+
addRoot(process.cwd());
|
|
1093
|
+
return uniqueRoots;
|
|
1094
|
+
}
|
|
1095
|
+
function trimLeadingSeparators2(value) {
|
|
1096
|
+
const trimmed = value.replace(/^[/\\]+/, "");
|
|
1097
|
+
return trimmed.length > 0 ? trimmed : value;
|
|
1098
|
+
}
|
|
1099
|
+
async function resolveFileReference2(rawValue, searchRoots) {
|
|
1100
|
+
const displayPath = trimLeadingSeparators2(rawValue);
|
|
1101
|
+
const potentialPaths = [];
|
|
1102
|
+
if (import_node_path7.default.isAbsolute(rawValue)) {
|
|
1103
|
+
potentialPaths.push(import_node_path7.default.normalize(rawValue));
|
|
1104
|
+
}
|
|
1105
|
+
for (const base of searchRoots) {
|
|
1106
|
+
potentialPaths.push(import_node_path7.default.resolve(base, displayPath));
|
|
1107
|
+
}
|
|
1108
|
+
const attempted = [];
|
|
1109
|
+
const seen = /* @__PURE__ */ new Set();
|
|
1110
|
+
for (const candidate of potentialPaths) {
|
|
1111
|
+
const absoluteCandidate = import_node_path7.default.resolve(candidate);
|
|
1112
|
+
if (seen.has(absoluteCandidate)) {
|
|
874
1113
|
continue;
|
|
875
1114
|
}
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
1115
|
+
seen.add(absoluteCandidate);
|
|
1116
|
+
attempted.push(absoluteCandidate);
|
|
1117
|
+
if (await fileExists2(absoluteCandidate)) {
|
|
1118
|
+
return { displayPath, resolvedPath: absoluteCandidate, attempted };
|
|
880
1119
|
}
|
|
881
|
-
parts.push({ content: JSON.stringify(entry), isFile: false });
|
|
882
1120
|
}
|
|
883
|
-
return
|
|
1121
|
+
return { displayPath, attempted };
|
|
884
1122
|
}
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
1123
|
+
|
|
1124
|
+
// src/evaluation/providers/ai-sdk.ts
|
|
1125
|
+
var import_anthropic = require("@ai-sdk/anthropic");
|
|
1126
|
+
var import_azure = require("@ai-sdk/azure");
|
|
1127
|
+
var import_google = require("@ai-sdk/google");
|
|
1128
|
+
var import_ai = require("ai");
|
|
1129
|
+
var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
|
|
1130
|
+
var AzureProvider = class {
|
|
1131
|
+
constructor(targetName, config) {
|
|
1132
|
+
this.config = config;
|
|
1133
|
+
this.id = `azure:${targetName}`;
|
|
1134
|
+
this.targetName = targetName;
|
|
1135
|
+
this.defaults = {
|
|
1136
|
+
temperature: config.temperature,
|
|
1137
|
+
maxOutputTokens: config.maxOutputTokens
|
|
1138
|
+
};
|
|
1139
|
+
this.retryConfig = config.retry;
|
|
1140
|
+
const azure = (0, import_azure.createAzure)(buildAzureOptions(config));
|
|
1141
|
+
this.model = azure(config.deploymentName);
|
|
890
1142
|
}
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
1143
|
+
id;
|
|
1144
|
+
kind = "azure";
|
|
1145
|
+
targetName;
|
|
1146
|
+
model;
|
|
1147
|
+
defaults;
|
|
1148
|
+
retryConfig;
|
|
1149
|
+
async invoke(request) {
|
|
1150
|
+
return invokeModel({
|
|
1151
|
+
model: this.model,
|
|
1152
|
+
request,
|
|
1153
|
+
defaults: this.defaults,
|
|
1154
|
+
retryConfig: this.retryConfig
|
|
1155
|
+
});
|
|
1156
|
+
}
|
|
1157
|
+
};
|
|
1158
|
+
var AnthropicProvider = class {
|
|
1159
|
+
constructor(targetName, config) {
|
|
1160
|
+
this.config = config;
|
|
1161
|
+
this.id = `anthropic:${targetName}`;
|
|
1162
|
+
this.targetName = targetName;
|
|
1163
|
+
this.defaults = {
|
|
1164
|
+
temperature: config.temperature,
|
|
1165
|
+
maxOutputTokens: config.maxOutputTokens,
|
|
1166
|
+
thinkingBudget: config.thinkingBudget
|
|
1167
|
+
};
|
|
1168
|
+
this.retryConfig = config.retry;
|
|
1169
|
+
const anthropic = (0, import_anthropic.createAnthropic)({
|
|
1170
|
+
apiKey: config.apiKey
|
|
1171
|
+
});
|
|
1172
|
+
this.model = anthropic(config.model);
|
|
1173
|
+
}
|
|
1174
|
+
id;
|
|
1175
|
+
kind = "anthropic";
|
|
1176
|
+
targetName;
|
|
1177
|
+
model;
|
|
1178
|
+
defaults;
|
|
1179
|
+
retryConfig;
|
|
1180
|
+
async invoke(request) {
|
|
1181
|
+
const providerOptions = buildAnthropicProviderOptions(this.defaults);
|
|
1182
|
+
return invokeModel({
|
|
1183
|
+
model: this.model,
|
|
1184
|
+
request,
|
|
1185
|
+
defaults: this.defaults,
|
|
1186
|
+
retryConfig: this.retryConfig,
|
|
1187
|
+
providerOptions
|
|
1188
|
+
});
|
|
894
1189
|
}
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
const script = asString(rawEvaluator.script);
|
|
909
|
-
if (!script) {
|
|
910
|
-
logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
|
|
911
|
-
continue;
|
|
912
|
-
}
|
|
913
|
-
const cwd = asString(rawEvaluator.cwd);
|
|
914
|
-
let resolvedCwd;
|
|
915
|
-
if (cwd) {
|
|
916
|
-
const resolved = await resolveFileReference(cwd, searchRoots);
|
|
917
|
-
if (resolved.resolvedPath) {
|
|
918
|
-
resolvedCwd = import_node_path2.default.resolve(resolved.resolvedPath);
|
|
919
|
-
} else {
|
|
920
|
-
logWarning(
|
|
921
|
-
`Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
922
|
-
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
923
|
-
);
|
|
924
|
-
}
|
|
925
|
-
} else {
|
|
926
|
-
resolvedCwd = searchRoots[0];
|
|
927
|
-
}
|
|
928
|
-
evaluators.push({
|
|
929
|
-
name,
|
|
930
|
-
type: "code",
|
|
931
|
-
script,
|
|
932
|
-
cwd,
|
|
933
|
-
resolvedCwd
|
|
934
|
-
});
|
|
935
|
-
continue;
|
|
936
|
-
}
|
|
937
|
-
const prompt = asString(rawEvaluator.prompt);
|
|
938
|
-
let promptPath;
|
|
939
|
-
if (prompt) {
|
|
940
|
-
const resolved = await resolveFileReference(prompt, searchRoots);
|
|
941
|
-
if (resolved.resolvedPath) {
|
|
942
|
-
promptPath = import_node_path2.default.resolve(resolved.resolvedPath);
|
|
943
|
-
} else {
|
|
944
|
-
logWarning(
|
|
945
|
-
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
946
|
-
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
947
|
-
);
|
|
948
|
-
}
|
|
949
|
-
}
|
|
950
|
-
const model = asString(rawEvaluator.model);
|
|
951
|
-
evaluators.push({
|
|
952
|
-
name,
|
|
953
|
-
type: "llm_judge",
|
|
954
|
-
prompt,
|
|
955
|
-
promptPath
|
|
1190
|
+
};
|
|
1191
|
+
var GeminiProvider = class {
|
|
1192
|
+
constructor(targetName, config) {
|
|
1193
|
+
this.config = config;
|
|
1194
|
+
this.id = `gemini:${targetName}`;
|
|
1195
|
+
this.targetName = targetName;
|
|
1196
|
+
this.defaults = {
|
|
1197
|
+
temperature: config.temperature,
|
|
1198
|
+
maxOutputTokens: config.maxOutputTokens
|
|
1199
|
+
};
|
|
1200
|
+
this.retryConfig = config.retry;
|
|
1201
|
+
const google = (0, import_google.createGoogleGenerativeAI)({
|
|
1202
|
+
apiKey: config.apiKey
|
|
956
1203
|
});
|
|
1204
|
+
this.model = google(config.model);
|
|
957
1205
|
}
|
|
958
|
-
|
|
1206
|
+
id;
|
|
1207
|
+
kind = "gemini";
|
|
1208
|
+
targetName;
|
|
1209
|
+
model;
|
|
1210
|
+
defaults;
|
|
1211
|
+
retryConfig;
|
|
1212
|
+
async invoke(request) {
|
|
1213
|
+
return invokeModel({
|
|
1214
|
+
model: this.model,
|
|
1215
|
+
request,
|
|
1216
|
+
defaults: this.defaults,
|
|
1217
|
+
retryConfig: this.retryConfig
|
|
1218
|
+
});
|
|
1219
|
+
}
|
|
1220
|
+
};
|
|
1221
|
+
function buildAzureOptions(config) {
|
|
1222
|
+
const options = {
|
|
1223
|
+
apiKey: config.apiKey,
|
|
1224
|
+
apiVersion: config.version,
|
|
1225
|
+
useDeploymentBasedUrls: true
|
|
1226
|
+
};
|
|
1227
|
+
const baseURL = normalizeAzureBaseUrl(config.resourceName);
|
|
1228
|
+
if (baseURL) {
|
|
1229
|
+
options.baseURL = baseURL;
|
|
1230
|
+
} else {
|
|
1231
|
+
options.resourceName = config.resourceName;
|
|
1232
|
+
}
|
|
1233
|
+
return options;
|
|
959
1234
|
}
|
|
960
|
-
function
|
|
961
|
-
|
|
1235
|
+
function normalizeAzureBaseUrl(resourceName) {
|
|
1236
|
+
const trimmed = resourceName.trim();
|
|
1237
|
+
if (!/^https?:\/\//i.test(trimmed)) {
|
|
962
1238
|
return void 0;
|
|
963
1239
|
}
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
968
|
-
return void 0;
|
|
1240
|
+
const withoutSlash = trimmed.replace(/\/+$/, "");
|
|
1241
|
+
const normalized = withoutSlash.endsWith("/openai") ? withoutSlash : `${withoutSlash}/openai`;
|
|
1242
|
+
return normalized;
|
|
969
1243
|
}
|
|
970
|
-
function
|
|
971
|
-
if (
|
|
972
|
-
|
|
973
|
-
console.warn(`${ANSI_YELLOW}Warning: ${message}
|
|
974
|
-
${detailBlock}${ANSI_RESET}`);
|
|
975
|
-
} else {
|
|
976
|
-
console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
|
|
1244
|
+
function buildAnthropicProviderOptions(defaults) {
|
|
1245
|
+
if (defaults.thinkingBudget === void 0) {
|
|
1246
|
+
return void 0;
|
|
977
1247
|
}
|
|
1248
|
+
return {
|
|
1249
|
+
anthropic: {
|
|
1250
|
+
thinking: {
|
|
1251
|
+
type: "enabled",
|
|
1252
|
+
budgetTokens: defaults.thinkingBudget
|
|
1253
|
+
}
|
|
1254
|
+
}
|
|
1255
|
+
};
|
|
978
1256
|
}
|
|
979
|
-
|
|
980
|
-
// src/evaluation/providers/ax.ts
|
|
981
|
-
var import_ax = require("@ax-llm/ax");
|
|
982
|
-
var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
|
|
983
1257
|
function buildChatPrompt(request) {
|
|
984
|
-
|
|
985
|
-
|
|
1258
|
+
const provided = request.chatPrompt?.length ? request.chatPrompt : void 0;
|
|
1259
|
+
if (provided) {
|
|
1260
|
+
const hasSystemMessage = provided.some((message) => message.role === "system");
|
|
986
1261
|
if (hasSystemMessage) {
|
|
987
|
-
return
|
|
1262
|
+
return provided;
|
|
988
1263
|
}
|
|
989
|
-
const systemContent2 = resolveSystemContent(request);
|
|
990
|
-
return [{ role: "system", content: systemContent2 }, ...
|
|
1264
|
+
const systemContent2 = resolveSystemContent(request, false);
|
|
1265
|
+
return [{ role: "system", content: systemContent2 }, ...provided];
|
|
991
1266
|
}
|
|
992
|
-
const systemContent = resolveSystemContent(request);
|
|
1267
|
+
const systemContent = resolveSystemContent(request, true);
|
|
993
1268
|
const userContent = request.question.trim();
|
|
994
1269
|
const prompt = [
|
|
995
|
-
{
|
|
996
|
-
|
|
997
|
-
content: systemContent
|
|
998
|
-
},
|
|
999
|
-
{
|
|
1000
|
-
role: "user",
|
|
1001
|
-
content: userContent
|
|
1002
|
-
}
|
|
1270
|
+
{ role: "system", content: systemContent },
|
|
1271
|
+
{ role: "user", content: userContent }
|
|
1003
1272
|
];
|
|
1004
1273
|
return prompt;
|
|
1005
1274
|
}
|
|
1006
|
-
function resolveSystemContent(request) {
|
|
1275
|
+
function resolveSystemContent(request, includeGuidelines) {
|
|
1007
1276
|
const systemSegments = [];
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
systemSegments.push(metadataSystemPrompt.trim());
|
|
1277
|
+
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
1278
|
+
systemSegments.push(request.systemPrompt.trim());
|
|
1011
1279
|
} else {
|
|
1012
1280
|
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
1013
1281
|
}
|
|
1014
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
1282
|
+
if (includeGuidelines && request.guidelines && request.guidelines.trim().length > 0) {
|
|
1015
1283
|
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
1016
1284
|
|
|
1017
1285
|
${request.guidelines.trim()}`);
|
|
1018
1286
|
}
|
|
1019
1287
|
return systemSegments.join("\n\n");
|
|
1020
1288
|
}
|
|
1021
|
-
function
|
|
1289
|
+
function toModelMessages(chatPrompt) {
|
|
1290
|
+
return chatPrompt.map((message) => {
|
|
1291
|
+
if (message.role === "tool" || message.role === "function") {
|
|
1292
|
+
const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
|
|
1293
|
+
return {
|
|
1294
|
+
role: "assistant",
|
|
1295
|
+
content: `${prefix}${message.content}`
|
|
1296
|
+
};
|
|
1297
|
+
}
|
|
1298
|
+
if (message.role === "assistant" || message.role === "system" || message.role === "user") {
|
|
1299
|
+
return {
|
|
1300
|
+
role: message.role,
|
|
1301
|
+
content: message.content
|
|
1302
|
+
};
|
|
1303
|
+
}
|
|
1304
|
+
return {
|
|
1305
|
+
role: "user",
|
|
1306
|
+
content: message.content
|
|
1307
|
+
};
|
|
1308
|
+
});
|
|
1309
|
+
}
|
|
1310
|
+
function resolveModelSettings(request, defaults) {
|
|
1022
1311
|
const temperature = request.temperature ?? defaults.temperature;
|
|
1023
|
-
const
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
}
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
}
|
|
1031
|
-
|
|
1312
|
+
const maxOutputTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
|
|
1313
|
+
return {
|
|
1314
|
+
temperature,
|
|
1315
|
+
maxOutputTokens
|
|
1316
|
+
};
|
|
1317
|
+
}
|
|
1318
|
+
async function invokeModel(options) {
|
|
1319
|
+
const { model, request, defaults, retryConfig, providerOptions } = options;
|
|
1320
|
+
const chatPrompt = buildChatPrompt(request);
|
|
1321
|
+
const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
|
|
1322
|
+
const result = await withRetry(
|
|
1323
|
+
() => (0, import_ai.generateText)({
|
|
1324
|
+
model,
|
|
1325
|
+
messages: toModelMessages(chatPrompt),
|
|
1326
|
+
temperature,
|
|
1327
|
+
maxOutputTokens,
|
|
1328
|
+
maxRetries: 0,
|
|
1329
|
+
abortSignal: request.signal,
|
|
1330
|
+
...providerOptions ? { providerOptions } : {}
|
|
1331
|
+
}),
|
|
1332
|
+
retryConfig,
|
|
1333
|
+
request.signal
|
|
1334
|
+
);
|
|
1335
|
+
return mapResponse(result);
|
|
1032
1336
|
}
|
|
1033
|
-
function mapResponse(
|
|
1034
|
-
const primary = response.results[0];
|
|
1035
|
-
const text = typeof primary?.content === "string" ? primary.content : "";
|
|
1036
|
-
const reasoning = primary?.thought ?? primary?.thoughtBlock?.data;
|
|
1037
|
-
const usage = toJsonObject(response.modelUsage);
|
|
1337
|
+
function mapResponse(result) {
|
|
1038
1338
|
return {
|
|
1039
|
-
text,
|
|
1040
|
-
reasoning,
|
|
1041
|
-
raw:
|
|
1042
|
-
usage
|
|
1339
|
+
text: result.text ?? "",
|
|
1340
|
+
reasoning: result.reasoningText ?? void 0,
|
|
1341
|
+
raw: result,
|
|
1342
|
+
usage: toJsonObject(result.totalUsage ?? result.usage)
|
|
1043
1343
|
};
|
|
1044
1344
|
}
|
|
1045
1345
|
function toJsonObject(value) {
|
|
@@ -1052,34 +1352,59 @@ function toJsonObject(value) {
|
|
|
1052
1352
|
return void 0;
|
|
1053
1353
|
}
|
|
1054
1354
|
}
|
|
1055
|
-
function
|
|
1056
|
-
if (typeof
|
|
1057
|
-
|
|
1355
|
+
function extractStatus(error) {
|
|
1356
|
+
if (!error || typeof error !== "object") {
|
|
1357
|
+
return void 0;
|
|
1358
|
+
}
|
|
1359
|
+
const candidate = error;
|
|
1360
|
+
const directStatus = candidate.status ?? candidate.statusCode;
|
|
1361
|
+
if (typeof directStatus === "number" && Number.isFinite(directStatus)) {
|
|
1362
|
+
return directStatus;
|
|
1058
1363
|
}
|
|
1059
|
-
|
|
1060
|
-
|
|
1364
|
+
const responseStatus = typeof candidate.response === "object" && candidate.response ? candidate.response.status : void 0;
|
|
1365
|
+
if (typeof responseStatus === "number" && Number.isFinite(responseStatus)) {
|
|
1366
|
+
return responseStatus;
|
|
1367
|
+
}
|
|
1368
|
+
const message = typeof candidate.message === "string" ? candidate.message : void 0;
|
|
1369
|
+
if (message) {
|
|
1370
|
+
const match = message.match(/HTTP\s+(\d{3})/i);
|
|
1371
|
+
if (match) {
|
|
1372
|
+
const parsed = Number.parseInt(match[1], 10);
|
|
1373
|
+
if (Number.isFinite(parsed)) {
|
|
1374
|
+
return parsed;
|
|
1375
|
+
}
|
|
1376
|
+
}
|
|
1061
1377
|
}
|
|
1062
|
-
return
|
|
1378
|
+
return void 0;
|
|
1063
1379
|
}
|
|
1064
|
-
function
|
|
1380
|
+
function isNetworkError(error) {
|
|
1065
1381
|
if (!error || typeof error !== "object") {
|
|
1066
1382
|
return false;
|
|
1067
1383
|
}
|
|
1068
|
-
|
|
1069
|
-
|
|
1384
|
+
const candidate = error;
|
|
1385
|
+
if (candidate.name === "AbortError") {
|
|
1386
|
+
return false;
|
|
1070
1387
|
}
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
const status = Number.parseInt(match[1], 10);
|
|
1075
|
-
return retryableStatusCodes.includes(status);
|
|
1076
|
-
}
|
|
1388
|
+
const code = candidate.code;
|
|
1389
|
+
if (typeof code === "string" && /^E(AI|CONN|HOST|NET|PIPE|TIME|REFUSED|RESET)/i.test(code)) {
|
|
1390
|
+
return true;
|
|
1077
1391
|
}
|
|
1078
|
-
|
|
1392
|
+
const message = typeof candidate.message === "string" ? candidate.message : void 0;
|
|
1393
|
+
if (message && /(network|fetch failed|ECONNRESET|ENOTFOUND|EAI_AGAIN|ETIMEDOUT|ECONNREFUSED)/i.test(message)) {
|
|
1079
1394
|
return true;
|
|
1080
1395
|
}
|
|
1081
1396
|
return false;
|
|
1082
1397
|
}
|
|
1398
|
+
function isRetryableError(error, retryableStatusCodes) {
|
|
1399
|
+
const status = extractStatus(error);
|
|
1400
|
+
if (status === 401 || status === 403) {
|
|
1401
|
+
return false;
|
|
1402
|
+
}
|
|
1403
|
+
if (typeof status === "number") {
|
|
1404
|
+
return retryableStatusCodes.includes(status);
|
|
1405
|
+
}
|
|
1406
|
+
return isNetworkError(error);
|
|
1407
|
+
}
|
|
1083
1408
|
function calculateRetryDelay(attempt, config) {
|
|
1084
1409
|
const delay = Math.min(
|
|
1085
1410
|
config.maxDelayMs,
|
|
@@ -1115,152 +1440,16 @@ async function withRetry(fn, retryConfig, signal) {
|
|
|
1115
1440
|
}
|
|
1116
1441
|
const delay = calculateRetryDelay(attempt, config);
|
|
1117
1442
|
await sleep(delay);
|
|
1118
|
-
if (signal?.aborted) {
|
|
1119
|
-
throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
|
|
1120
|
-
}
|
|
1121
1443
|
}
|
|
1122
1444
|
}
|
|
1123
1445
|
throw lastError;
|
|
1124
1446
|
}
|
|
1125
|
-
var AzureProvider = class {
|
|
1126
|
-
constructor(targetName, config) {
|
|
1127
|
-
this.config = config;
|
|
1128
|
-
this.id = `azure:${targetName}`;
|
|
1129
|
-
this.targetName = targetName;
|
|
1130
|
-
this.defaults = {
|
|
1131
|
-
temperature: config.temperature,
|
|
1132
|
-
maxOutputTokens: config.maxOutputTokens
|
|
1133
|
-
};
|
|
1134
|
-
this.retryConfig = config.retry;
|
|
1135
|
-
this.ai = import_ax.AxAI.create({
|
|
1136
|
-
name: "azure-openai",
|
|
1137
|
-
apiKey: config.apiKey,
|
|
1138
|
-
resourceName: config.resourceName,
|
|
1139
|
-
deploymentName: config.deploymentName,
|
|
1140
|
-
version: config.version,
|
|
1141
|
-
config: {
|
|
1142
|
-
stream: false
|
|
1143
|
-
}
|
|
1144
|
-
});
|
|
1145
|
-
}
|
|
1146
|
-
id;
|
|
1147
|
-
kind = "azure";
|
|
1148
|
-
targetName;
|
|
1149
|
-
ai;
|
|
1150
|
-
defaults;
|
|
1151
|
-
retryConfig;
|
|
1152
|
-
async invoke(request) {
|
|
1153
|
-
const chatPrompt = buildChatPrompt(request);
|
|
1154
|
-
const modelConfig = extractModelConfig(request, this.defaults);
|
|
1155
|
-
const response = await withRetry(
|
|
1156
|
-
async () => await this.ai.chat(
|
|
1157
|
-
{
|
|
1158
|
-
chatPrompt,
|
|
1159
|
-
model: this.config.deploymentName,
|
|
1160
|
-
...modelConfig ? { modelConfig } : {}
|
|
1161
|
-
},
|
|
1162
|
-
request.signal ? { abortSignal: request.signal } : void 0
|
|
1163
|
-
),
|
|
1164
|
-
this.retryConfig,
|
|
1165
|
-
request.signal
|
|
1166
|
-
);
|
|
1167
|
-
return mapResponse(ensureChatResponse(response));
|
|
1168
|
-
}
|
|
1169
|
-
getAxAI() {
|
|
1170
|
-
return this.ai;
|
|
1171
|
-
}
|
|
1172
|
-
};
|
|
1173
|
-
var AnthropicProvider = class {
|
|
1174
|
-
constructor(targetName, config) {
|
|
1175
|
-
this.config = config;
|
|
1176
|
-
this.id = `anthropic:${targetName}`;
|
|
1177
|
-
this.targetName = targetName;
|
|
1178
|
-
this.defaults = {
|
|
1179
|
-
temperature: config.temperature,
|
|
1180
|
-
maxOutputTokens: config.maxOutputTokens,
|
|
1181
|
-
thinkingBudget: config.thinkingBudget
|
|
1182
|
-
};
|
|
1183
|
-
this.retryConfig = config.retry;
|
|
1184
|
-
this.ai = import_ax.AxAI.create({
|
|
1185
|
-
name: "anthropic",
|
|
1186
|
-
apiKey: config.apiKey
|
|
1187
|
-
});
|
|
1188
|
-
}
|
|
1189
|
-
id;
|
|
1190
|
-
kind = "anthropic";
|
|
1191
|
-
targetName;
|
|
1192
|
-
ai;
|
|
1193
|
-
defaults;
|
|
1194
|
-
retryConfig;
|
|
1195
|
-
async invoke(request) {
|
|
1196
|
-
const chatPrompt = buildChatPrompt(request);
|
|
1197
|
-
const modelConfig = extractModelConfig(request, this.defaults);
|
|
1198
|
-
const response = await withRetry(
|
|
1199
|
-
async () => await this.ai.chat(
|
|
1200
|
-
{
|
|
1201
|
-
chatPrompt,
|
|
1202
|
-
model: this.config.model,
|
|
1203
|
-
...modelConfig ? { modelConfig } : {}
|
|
1204
|
-
},
|
|
1205
|
-
request.signal ? { abortSignal: request.signal } : void 0
|
|
1206
|
-
),
|
|
1207
|
-
this.retryConfig,
|
|
1208
|
-
request.signal
|
|
1209
|
-
);
|
|
1210
|
-
return mapResponse(ensureChatResponse(response));
|
|
1211
|
-
}
|
|
1212
|
-
getAxAI() {
|
|
1213
|
-
return this.ai;
|
|
1214
|
-
}
|
|
1215
|
-
};
|
|
1216
|
-
var GeminiProvider = class {
|
|
1217
|
-
constructor(targetName, config) {
|
|
1218
|
-
this.config = config;
|
|
1219
|
-
this.id = `gemini:${targetName}`;
|
|
1220
|
-
this.targetName = targetName;
|
|
1221
|
-
this.defaults = {
|
|
1222
|
-
temperature: config.temperature,
|
|
1223
|
-
maxOutputTokens: config.maxOutputTokens
|
|
1224
|
-
};
|
|
1225
|
-
this.retryConfig = config.retry;
|
|
1226
|
-
this.ai = import_ax.AxAI.create({
|
|
1227
|
-
name: "google-gemini",
|
|
1228
|
-
apiKey: config.apiKey
|
|
1229
|
-
});
|
|
1230
|
-
}
|
|
1231
|
-
id;
|
|
1232
|
-
kind = "gemini";
|
|
1233
|
-
targetName;
|
|
1234
|
-
ai;
|
|
1235
|
-
defaults;
|
|
1236
|
-
retryConfig;
|
|
1237
|
-
async invoke(request) {
|
|
1238
|
-
const chatPrompt = buildChatPrompt(request);
|
|
1239
|
-
const modelConfig = extractModelConfig(request, this.defaults);
|
|
1240
|
-
const response = await withRetry(
|
|
1241
|
-
async () => await this.ai.chat(
|
|
1242
|
-
{
|
|
1243
|
-
chatPrompt,
|
|
1244
|
-
model: this.config.model,
|
|
1245
|
-
...modelConfig ? { modelConfig } : {}
|
|
1246
|
-
},
|
|
1247
|
-
request.signal ? { abortSignal: request.signal } : void 0
|
|
1248
|
-
),
|
|
1249
|
-
this.retryConfig,
|
|
1250
|
-
request.signal
|
|
1251
|
-
);
|
|
1252
|
-
return mapResponse(ensureChatResponse(response));
|
|
1253
|
-
}
|
|
1254
|
-
getAxAI() {
|
|
1255
|
-
return this.ai;
|
|
1256
|
-
}
|
|
1257
|
-
};
|
|
1258
1447
|
|
|
1259
1448
|
// src/evaluation/providers/cli.ts
|
|
1260
1449
|
var import_node_child_process = require("child_process");
|
|
1261
|
-
var
|
|
1450
|
+
var import_promises7 = __toESM(require("fs/promises"), 1);
|
|
1262
1451
|
var import_node_os = __toESM(require("os"), 1);
|
|
1263
|
-
var
|
|
1452
|
+
var import_node_path8 = __toESM(require("path"), 1);
|
|
1264
1453
|
var import_node_util = require("util");
|
|
1265
1454
|
var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
|
|
1266
1455
|
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
|
|
@@ -1302,12 +1491,14 @@ var CliProvider = class {
|
|
|
1302
1491
|
supportsBatch = false;
|
|
1303
1492
|
config;
|
|
1304
1493
|
runCommand;
|
|
1494
|
+
verbose;
|
|
1305
1495
|
healthcheckPromise;
|
|
1306
1496
|
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
1307
1497
|
this.targetName = targetName;
|
|
1308
1498
|
this.id = `cli:${targetName}`;
|
|
1309
1499
|
this.config = config;
|
|
1310
1500
|
this.runCommand = runner;
|
|
1501
|
+
this.verbose = config.verbose ?? false;
|
|
1311
1502
|
}
|
|
1312
1503
|
async invoke(request) {
|
|
1313
1504
|
if (request.signal?.aborted) {
|
|
@@ -1357,7 +1548,7 @@ var CliProvider = class {
|
|
|
1357
1548
|
const errorMsg = error instanceof Error ? error.message : String(error);
|
|
1358
1549
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
1359
1550
|
} finally {
|
|
1360
|
-
await
|
|
1551
|
+
await import_promises7.default.unlink(filePath).catch(() => {
|
|
1361
1552
|
});
|
|
1362
1553
|
}
|
|
1363
1554
|
}
|
|
@@ -1408,6 +1599,11 @@ var CliProvider = class {
|
|
|
1408
1599
|
generateOutputFilePath("healthcheck")
|
|
1409
1600
|
)
|
|
1410
1601
|
);
|
|
1602
|
+
if (this.verbose) {
|
|
1603
|
+
console.log(
|
|
1604
|
+
`[cli-provider:${this.targetName}] (healthcheck) CLI_EVALS_DIR=${process.env.CLI_EVALS_DIR ?? ""} cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
|
|
1605
|
+
);
|
|
1606
|
+
}
|
|
1411
1607
|
const result = await this.runCommand(renderedCommand, {
|
|
1412
1608
|
cwd: healthcheck.cwd ?? this.config.cwd,
|
|
1413
1609
|
env: process.env,
|
|
@@ -1439,7 +1635,7 @@ function normalizeInputFiles(inputFiles) {
|
|
|
1439
1635
|
}
|
|
1440
1636
|
const unique = /* @__PURE__ */ new Map();
|
|
1441
1637
|
for (const inputFile of inputFiles) {
|
|
1442
|
-
const absolutePath =
|
|
1638
|
+
const absolutePath = import_node_path8.default.resolve(inputFile);
|
|
1443
1639
|
if (!unique.has(absolutePath)) {
|
|
1444
1640
|
unique.set(absolutePath, absolutePath);
|
|
1445
1641
|
}
|
|
@@ -1453,7 +1649,7 @@ function formatFileList(files, template) {
|
|
|
1453
1649
|
const formatter = template ?? "{path}";
|
|
1454
1650
|
return files.map((filePath) => {
|
|
1455
1651
|
const escapedPath = shellEscape(filePath);
|
|
1456
|
-
const escapedName = shellEscape(
|
|
1652
|
+
const escapedName = shellEscape(import_node_path8.default.basename(filePath));
|
|
1457
1653
|
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
1458
1654
|
}).join(" ");
|
|
1459
1655
|
}
|
|
@@ -1477,7 +1673,7 @@ function generateOutputFilePath(evalCaseId) {
|
|
|
1477
1673
|
const safeEvalId = evalCaseId || "unknown";
|
|
1478
1674
|
const timestamp = Date.now();
|
|
1479
1675
|
const random = Math.random().toString(36).substring(2, 9);
|
|
1480
|
-
return
|
|
1676
|
+
return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
|
|
1481
1677
|
}
|
|
1482
1678
|
function formatTimeoutSuffix(timeoutMs) {
|
|
1483
1679
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -1491,9 +1687,9 @@ function formatTimeoutSuffix(timeoutMs) {
|
|
|
1491
1687
|
var import_node_child_process2 = require("child_process");
|
|
1492
1688
|
var import_node_crypto = require("crypto");
|
|
1493
1689
|
var import_node_fs3 = require("fs");
|
|
1494
|
-
var
|
|
1690
|
+
var import_promises8 = require("fs/promises");
|
|
1495
1691
|
var import_node_os2 = require("os");
|
|
1496
|
-
var
|
|
1692
|
+
var import_node_path10 = __toESM(require("path"), 1);
|
|
1497
1693
|
var import_node_util2 = require("util");
|
|
1498
1694
|
|
|
1499
1695
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
@@ -1550,7 +1746,7 @@ function subscribeToCodexLogEntries(listener) {
|
|
|
1550
1746
|
}
|
|
1551
1747
|
|
|
1552
1748
|
// src/evaluation/providers/preread.ts
|
|
1553
|
-
var
|
|
1749
|
+
var import_node_path9 = __toESM(require("path"), 1);
|
|
1554
1750
|
function buildPromptDocument(request, inputFiles, options) {
|
|
1555
1751
|
const parts = [];
|
|
1556
1752
|
const guidelineFiles = collectGuidelineFiles(
|
|
@@ -1575,7 +1771,7 @@ function normalizeInputFiles2(inputFiles) {
|
|
|
1575
1771
|
}
|
|
1576
1772
|
const deduped = /* @__PURE__ */ new Map();
|
|
1577
1773
|
for (const inputFile of inputFiles) {
|
|
1578
|
-
const absolutePath =
|
|
1774
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
1579
1775
|
if (!deduped.has(absolutePath)) {
|
|
1580
1776
|
deduped.set(absolutePath, absolutePath);
|
|
1581
1777
|
}
|
|
@@ -1588,14 +1784,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
|
1588
1784
|
}
|
|
1589
1785
|
const unique = /* @__PURE__ */ new Map();
|
|
1590
1786
|
for (const inputFile of inputFiles) {
|
|
1591
|
-
const absolutePath =
|
|
1787
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
1592
1788
|
if (overrides?.has(absolutePath)) {
|
|
1593
1789
|
if (!unique.has(absolutePath)) {
|
|
1594
1790
|
unique.set(absolutePath, absolutePath);
|
|
1595
1791
|
}
|
|
1596
1792
|
continue;
|
|
1597
1793
|
}
|
|
1598
|
-
const normalized = absolutePath.split(
|
|
1794
|
+
const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
|
|
1599
1795
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1600
1796
|
if (!unique.has(absolutePath)) {
|
|
1601
1797
|
unique.set(absolutePath, absolutePath);
|
|
@@ -1610,7 +1806,7 @@ function collectInputFiles(inputFiles) {
|
|
|
1610
1806
|
}
|
|
1611
1807
|
const unique = /* @__PURE__ */ new Map();
|
|
1612
1808
|
for (const inputFile of inputFiles) {
|
|
1613
|
-
const absolutePath =
|
|
1809
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
1614
1810
|
if (!unique.has(absolutePath)) {
|
|
1615
1811
|
unique.set(absolutePath, absolutePath);
|
|
1616
1812
|
}
|
|
@@ -1622,7 +1818,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
|
1622
1818
|
return "";
|
|
1623
1819
|
}
|
|
1624
1820
|
const buildList = (files) => files.map((absolutePath) => {
|
|
1625
|
-
const fileName =
|
|
1821
|
+
const fileName = import_node_path9.default.basename(absolutePath);
|
|
1626
1822
|
const fileUri = pathToFileUri(absolutePath);
|
|
1627
1823
|
return `* [${fileName}](${fileUri})`;
|
|
1628
1824
|
});
|
|
@@ -1642,7 +1838,7 @@ ${buildList(inputFiles).join("\n")}.`);
|
|
|
1642
1838
|
return sections.join("\n");
|
|
1643
1839
|
}
|
|
1644
1840
|
function pathToFileUri(filePath) {
|
|
1645
|
-
const absolutePath =
|
|
1841
|
+
const absolutePath = import_node_path9.default.isAbsolute(filePath) ? filePath : import_node_path9.default.resolve(filePath);
|
|
1646
1842
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
1647
1843
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
1648
1844
|
return `file:///${normalizedPath}`;
|
|
@@ -1680,8 +1876,8 @@ var CodexProvider = class {
|
|
|
1680
1876
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
1681
1877
|
try {
|
|
1682
1878
|
const promptContent = buildPromptDocument(request, inputFiles);
|
|
1683
|
-
const promptFile =
|
|
1684
|
-
await (0,
|
|
1879
|
+
const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
|
|
1880
|
+
await (0, import_promises8.writeFile)(promptFile, promptContent, "utf8");
|
|
1685
1881
|
const args = this.buildCodexArgs();
|
|
1686
1882
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
1687
1883
|
const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
|
|
@@ -1730,7 +1926,7 @@ var CodexProvider = class {
|
|
|
1730
1926
|
if (!this.config.cwd) {
|
|
1731
1927
|
return workspaceRoot;
|
|
1732
1928
|
}
|
|
1733
|
-
return
|
|
1929
|
+
return import_node_path10.default.resolve(this.config.cwd);
|
|
1734
1930
|
}
|
|
1735
1931
|
buildCodexArgs() {
|
|
1736
1932
|
const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
|
|
@@ -1764,11 +1960,11 @@ var CodexProvider = class {
|
|
|
1764
1960
|
}
|
|
1765
1961
|
}
|
|
1766
1962
|
async createWorkspace() {
|
|
1767
|
-
return await (0,
|
|
1963
|
+
return await (0, import_promises8.mkdtemp)(import_node_path10.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
|
|
1768
1964
|
}
|
|
1769
1965
|
async cleanupWorkspace(workspaceRoot) {
|
|
1770
1966
|
try {
|
|
1771
|
-
await (0,
|
|
1967
|
+
await (0, import_promises8.rm)(workspaceRoot, { recursive: true, force: true });
|
|
1772
1968
|
} catch {
|
|
1773
1969
|
}
|
|
1774
1970
|
}
|
|
@@ -1778,9 +1974,9 @@ var CodexProvider = class {
|
|
|
1778
1974
|
return void 0;
|
|
1779
1975
|
}
|
|
1780
1976
|
if (this.config.logDir) {
|
|
1781
|
-
return
|
|
1977
|
+
return import_node_path10.default.resolve(this.config.logDir);
|
|
1782
1978
|
}
|
|
1783
|
-
return
|
|
1979
|
+
return import_node_path10.default.join(process.cwd(), ".agentv", "logs", "codex");
|
|
1784
1980
|
}
|
|
1785
1981
|
async createStreamLogger(request) {
|
|
1786
1982
|
const logDir = this.resolveLogDirectory();
|
|
@@ -1788,13 +1984,13 @@ var CodexProvider = class {
|
|
|
1788
1984
|
return void 0;
|
|
1789
1985
|
}
|
|
1790
1986
|
try {
|
|
1791
|
-
await (0,
|
|
1987
|
+
await (0, import_promises8.mkdir)(logDir, { recursive: true });
|
|
1792
1988
|
} catch (error) {
|
|
1793
1989
|
const message = error instanceof Error ? error.message : String(error);
|
|
1794
1990
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
1795
1991
|
return void 0;
|
|
1796
1992
|
}
|
|
1797
|
-
const filePath =
|
|
1993
|
+
const filePath = import_node_path10.default.join(logDir, buildLogFilename(request, this.targetName));
|
|
1798
1994
|
try {
|
|
1799
1995
|
const logger = await CodexStreamLogger.create({
|
|
1800
1996
|
filePath,
|
|
@@ -2009,9 +2205,9 @@ function tryParseJsonValue(rawLine) {
|
|
|
2009
2205
|
async function locateExecutable(candidate) {
|
|
2010
2206
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
2011
2207
|
if (includesPathSeparator) {
|
|
2012
|
-
const resolved =
|
|
2208
|
+
const resolved = import_node_path10.default.isAbsolute(candidate) ? candidate : import_node_path10.default.resolve(candidate);
|
|
2013
2209
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
2014
|
-
await (0,
|
|
2210
|
+
await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
|
|
2015
2211
|
return executablePath;
|
|
2016
2212
|
}
|
|
2017
2213
|
const locator = process.platform === "win32" ? "where" : "which";
|
|
@@ -2021,7 +2217,7 @@ async function locateExecutable(candidate) {
|
|
|
2021
2217
|
const preferred = selectExecutableCandidate(lines);
|
|
2022
2218
|
if (preferred) {
|
|
2023
2219
|
const executablePath = await ensureWindowsExecutableVariant(preferred);
|
|
2024
|
-
await (0,
|
|
2220
|
+
await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
|
|
2025
2221
|
return executablePath;
|
|
2026
2222
|
}
|
|
2027
2223
|
} catch {
|
|
@@ -2055,7 +2251,7 @@ async function ensureWindowsExecutableVariant(candidate) {
|
|
|
2055
2251
|
for (const ext of extensions) {
|
|
2056
2252
|
const withExtension = `${candidate}${ext}`;
|
|
2057
2253
|
try {
|
|
2058
|
-
await (0,
|
|
2254
|
+
await (0, import_promises8.access)(withExtension, import_node_fs3.constants.F_OK);
|
|
2059
2255
|
return withExtension;
|
|
2060
2256
|
} catch {
|
|
2061
2257
|
}
|
|
@@ -2867,7 +3063,7 @@ function resolveOptionalNumberArray(source, description) {
|
|
|
2867
3063
|
}
|
|
2868
3064
|
|
|
2869
3065
|
// src/evaluation/providers/vscode.ts
|
|
2870
|
-
var
|
|
3066
|
+
var import_node_path11 = __toESM(require("path"), 1);
|
|
2871
3067
|
var import_subagent = require("subagent");
|
|
2872
3068
|
var VSCodeProvider = class {
|
|
2873
3069
|
id;
|
|
@@ -2980,6 +3176,9 @@ var VSCodeProvider = class {
|
|
|
2980
3176
|
};
|
|
2981
3177
|
function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
2982
3178
|
const parts = [];
|
|
3179
|
+
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
3180
|
+
parts.push(request.systemPrompt.trim());
|
|
3181
|
+
}
|
|
2983
3182
|
const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
|
|
2984
3183
|
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
2985
3184
|
const nonGuidelineAttachments = attachmentFiles.filter(
|
|
@@ -2997,7 +3196,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
2997
3196
|
return "";
|
|
2998
3197
|
}
|
|
2999
3198
|
const buildList = (files) => files.map((absolutePath) => {
|
|
3000
|
-
const fileName =
|
|
3199
|
+
const fileName = import_node_path11.default.basename(absolutePath);
|
|
3001
3200
|
const fileUri = pathToFileUri2(absolutePath);
|
|
3002
3201
|
return `* [${fileName}](${fileUri})`;
|
|
3003
3202
|
});
|
|
@@ -3022,8 +3221,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
3022
3221
|
}
|
|
3023
3222
|
const unique = /* @__PURE__ */ new Map();
|
|
3024
3223
|
for (const attachment of attachments) {
|
|
3025
|
-
const absolutePath =
|
|
3026
|
-
const normalized = absolutePath.split(
|
|
3224
|
+
const absolutePath = import_node_path11.default.resolve(attachment);
|
|
3225
|
+
const normalized = absolutePath.split(import_node_path11.default.sep).join("/");
|
|
3027
3226
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
3028
3227
|
if (!unique.has(absolutePath)) {
|
|
3029
3228
|
unique.set(absolutePath, absolutePath);
|
|
@@ -3038,7 +3237,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3038
3237
|
}
|
|
3039
3238
|
const unique = /* @__PURE__ */ new Map();
|
|
3040
3239
|
for (const attachment of attachments) {
|
|
3041
|
-
const absolutePath =
|
|
3240
|
+
const absolutePath = import_node_path11.default.resolve(attachment);
|
|
3042
3241
|
if (!unique.has(absolutePath)) {
|
|
3043
3242
|
unique.set(absolutePath, absolutePath);
|
|
3044
3243
|
}
|
|
@@ -3046,7 +3245,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3046
3245
|
return Array.from(unique.values());
|
|
3047
3246
|
}
|
|
3048
3247
|
function pathToFileUri2(filePath) {
|
|
3049
|
-
const absolutePath =
|
|
3248
|
+
const absolutePath = import_node_path11.default.isAbsolute(filePath) ? filePath : import_node_path11.default.resolve(filePath);
|
|
3050
3249
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
3051
3250
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
3052
3251
|
return `file:///${normalizedPath}`;
|
|
@@ -3059,7 +3258,7 @@ function normalizeAttachments(attachments) {
|
|
|
3059
3258
|
}
|
|
3060
3259
|
const deduped = /* @__PURE__ */ new Set();
|
|
3061
3260
|
for (const attachment of attachments) {
|
|
3062
|
-
deduped.add(
|
|
3261
|
+
deduped.add(import_node_path11.default.resolve(attachment));
|
|
3063
3262
|
}
|
|
3064
3263
|
return Array.from(deduped);
|
|
3065
3264
|
}
|
|
@@ -3068,7 +3267,7 @@ function mergeAttachments(all) {
|
|
|
3068
3267
|
for (const list of all) {
|
|
3069
3268
|
if (!list) continue;
|
|
3070
3269
|
for (const inputFile of list) {
|
|
3071
|
-
deduped.add(
|
|
3270
|
+
deduped.add(import_node_path11.default.resolve(inputFile));
|
|
3072
3271
|
}
|
|
3073
3272
|
}
|
|
3074
3273
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -3114,9 +3313,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
3114
3313
|
|
|
3115
3314
|
// src/evaluation/providers/targets-file.ts
|
|
3116
3315
|
var import_node_fs4 = require("fs");
|
|
3117
|
-
var
|
|
3118
|
-
var
|
|
3119
|
-
var
|
|
3316
|
+
var import_promises9 = require("fs/promises");
|
|
3317
|
+
var import_node_path12 = __toESM(require("path"), 1);
|
|
3318
|
+
var import_yaml3 = require("yaml");
|
|
3120
3319
|
|
|
3121
3320
|
// src/evaluation/providers/types.ts
|
|
3122
3321
|
var AGENT_PROVIDER_KINDS = [
|
|
@@ -3177,19 +3376,19 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
3177
3376
|
}
|
|
3178
3377
|
async function fileExists3(filePath) {
|
|
3179
3378
|
try {
|
|
3180
|
-
await (0,
|
|
3379
|
+
await (0, import_promises9.access)(filePath, import_node_fs4.constants.F_OK);
|
|
3181
3380
|
return true;
|
|
3182
3381
|
} catch {
|
|
3183
3382
|
return false;
|
|
3184
3383
|
}
|
|
3185
3384
|
}
|
|
3186
3385
|
async function readTargetDefinitions(filePath) {
|
|
3187
|
-
const absolutePath =
|
|
3386
|
+
const absolutePath = import_node_path12.default.resolve(filePath);
|
|
3188
3387
|
if (!await fileExists3(absolutePath)) {
|
|
3189
3388
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
3190
3389
|
}
|
|
3191
|
-
const raw = await (0,
|
|
3192
|
-
const parsed = (0,
|
|
3390
|
+
const raw = await (0, import_promises9.readFile)(absolutePath, "utf8");
|
|
3391
|
+
const parsed = (0, import_yaml3.parse)(raw);
|
|
3193
3392
|
if (!isRecord(parsed)) {
|
|
3194
3393
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
3195
3394
|
}
|
|
@@ -3232,18 +3431,34 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
3232
3431
|
}
|
|
3233
3432
|
|
|
3234
3433
|
// src/evaluation/evaluators.ts
|
|
3235
|
-
var
|
|
3434
|
+
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
3435
|
+
|
|
3436
|
+
Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
3437
|
+
|
|
3438
|
+
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
3439
|
+
|
|
3440
|
+
[[ ## expected_outcome ## ]]
|
|
3441
|
+
{{expected_outcome}}
|
|
3442
|
+
|
|
3443
|
+
[[ ## question ## ]]
|
|
3444
|
+
{{question}}
|
|
3445
|
+
|
|
3446
|
+
[[ ## reference_answer ## ]]
|
|
3447
|
+
{{reference_answer}}
|
|
3448
|
+
|
|
3449
|
+
[[ ## candidate_answer ## ]]
|
|
3450
|
+
{{candidate_answer}}`;
|
|
3236
3451
|
var LlmJudgeEvaluator = class {
|
|
3237
3452
|
kind = "llm_judge";
|
|
3238
3453
|
resolveJudgeProvider;
|
|
3239
3454
|
maxOutputTokens;
|
|
3240
3455
|
temperature;
|
|
3241
|
-
|
|
3456
|
+
evaluatorTemplate;
|
|
3242
3457
|
constructor(options) {
|
|
3243
3458
|
this.resolveJudgeProvider = options.resolveJudgeProvider;
|
|
3244
3459
|
this.maxOutputTokens = options.maxOutputTokens;
|
|
3245
3460
|
this.temperature = options.temperature;
|
|
3246
|
-
this.
|
|
3461
|
+
this.evaluatorTemplate = options.evaluatorTemplate;
|
|
3247
3462
|
}
|
|
3248
3463
|
async evaluate(context) {
|
|
3249
3464
|
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
@@ -3253,26 +3468,21 @@ var LlmJudgeEvaluator = class {
|
|
|
3253
3468
|
return this.evaluateWithPrompt(context, judgeProvider);
|
|
3254
3469
|
}
|
|
3255
3470
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
3256
|
-
const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
|
|
3257
3471
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
3258
|
-
|
|
3259
|
-
|
|
3260
|
-
|
|
3261
|
-
|
|
3262
|
-
|
|
3263
|
-
|
|
3264
|
-
|
|
3265
|
-
|
|
3266
|
-
|
|
3267
|
-
|
|
3268
|
-
|
|
3269
|
-
prompt = substituteVariables(systemPrompt, variables);
|
|
3270
|
-
systemPrompt = buildSystemPrompt(hasReferenceAnswer);
|
|
3271
|
-
}
|
|
3272
|
-
const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
|
|
3472
|
+
const variables = {
|
|
3473
|
+
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
3474
|
+
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
3475
|
+
candidate_answer: context.candidate.trim(),
|
|
3476
|
+
reference_answer: (context.evalCase.reference_answer ?? "").trim(),
|
|
3477
|
+
expected_outcome: context.evalCase.expected_outcome.trim(),
|
|
3478
|
+
question: formattedQuestion.trim()
|
|
3479
|
+
};
|
|
3480
|
+
const systemPrompt = buildOutputSchema();
|
|
3481
|
+
const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
|
|
3482
|
+
const userPrompt = substituteVariables(evaluatorTemplate, variables);
|
|
3273
3483
|
const response = await judgeProvider.invoke({
|
|
3274
|
-
question:
|
|
3275
|
-
|
|
3484
|
+
question: userPrompt,
|
|
3485
|
+
systemPrompt,
|
|
3276
3486
|
evalCaseId: context.evalCase.id,
|
|
3277
3487
|
attempt: context.attempt,
|
|
3278
3488
|
maxOutputTokens: this.maxOutputTokens,
|
|
@@ -3285,11 +3495,9 @@ var LlmJudgeEvaluator = class {
|
|
|
3285
3495
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
3286
3496
|
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
3287
3497
|
const evaluatorRawRequest = {
|
|
3288
|
-
|
|
3289
|
-
|
|
3290
|
-
|
|
3291
|
-
target: context.target.name,
|
|
3292
|
-
...systemPrompt !== void 0 && { systemPrompt }
|
|
3498
|
+
userPrompt,
|
|
3499
|
+
systemPrompt,
|
|
3500
|
+
target: judgeProvider.targetName
|
|
3293
3501
|
};
|
|
3294
3502
|
return {
|
|
3295
3503
|
score,
|
|
@@ -3301,20 +3509,8 @@ var LlmJudgeEvaluator = class {
|
|
|
3301
3509
|
};
|
|
3302
3510
|
}
|
|
3303
3511
|
};
|
|
3304
|
-
function
|
|
3305
|
-
|
|
3306
|
-
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
3307
|
-
""
|
|
3308
|
-
];
|
|
3309
|
-
if (hasReferenceAnswer) {
|
|
3310
|
-
basePrompt.push(
|
|
3311
|
-
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
|
|
3312
|
-
""
|
|
3313
|
-
);
|
|
3314
|
-
}
|
|
3315
|
-
basePrompt.push(
|
|
3316
|
-
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
3317
|
-
"",
|
|
3512
|
+
function buildOutputSchema() {
|
|
3513
|
+
return [
|
|
3318
3514
|
"You must respond with a single JSON object matching this schema:",
|
|
3319
3515
|
"",
|
|
3320
3516
|
"{",
|
|
@@ -3323,30 +3519,7 @@ function buildSystemPrompt(hasReferenceAnswer) {
|
|
|
3323
3519
|
' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
|
|
3324
3520
|
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
3325
3521
|
"}"
|
|
3326
|
-
);
|
|
3327
|
-
return basePrompt.join("\n");
|
|
3328
|
-
}
|
|
3329
|
-
function buildQualityPrompt(evalCase, candidate, question) {
|
|
3330
|
-
const parts = [
|
|
3331
|
-
"[[ ## expected_outcome ## ]]",
|
|
3332
|
-
evalCase.expected_outcome.trim(),
|
|
3333
|
-
"",
|
|
3334
|
-
"[[ ## question ## ]]",
|
|
3335
|
-
question.trim(),
|
|
3336
|
-
""
|
|
3337
|
-
];
|
|
3338
|
-
if (hasNonEmptyReferenceAnswer(evalCase)) {
|
|
3339
|
-
parts.push(
|
|
3340
|
-
"[[ ## reference_answer ## ]]",
|
|
3341
|
-
evalCase.reference_answer.trim(),
|
|
3342
|
-
""
|
|
3343
|
-
);
|
|
3344
|
-
}
|
|
3345
|
-
parts.push(
|
|
3346
|
-
"[[ ## candidate_answer ## ]]",
|
|
3347
|
-
candidate.trim()
|
|
3348
|
-
);
|
|
3349
|
-
return parts.join("\n");
|
|
3522
|
+
].join("\n");
|
|
3350
3523
|
}
|
|
3351
3524
|
function clampScore(value) {
|
|
3352
3525
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
@@ -3428,9 +3601,6 @@ function extractJsonBlob(text) {
|
|
|
3428
3601
|
function isNonEmptyString(value) {
|
|
3429
3602
|
return typeof value === "string" && value.trim().length > 0;
|
|
3430
3603
|
}
|
|
3431
|
-
function hasNonEmptyReferenceAnswer(evalCase) {
|
|
3432
|
-
return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
|
|
3433
|
-
}
|
|
3434
3604
|
var CodeEvaluator = class {
|
|
3435
3605
|
kind = "code";
|
|
3436
3606
|
script;
|
|
@@ -3536,19 +3706,16 @@ function parseJsonSafe(payload) {
|
|
|
3536
3706
|
return void 0;
|
|
3537
3707
|
}
|
|
3538
3708
|
}
|
|
3539
|
-
function hasTemplateVariables(text) {
|
|
3540
|
-
return /\$\{[a-zA-Z0-9_]+\}/.test(text);
|
|
3541
|
-
}
|
|
3542
3709
|
function substituteVariables(template, variables) {
|
|
3543
|
-
return template.replace(
|
|
3710
|
+
return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
|
|
3544
3711
|
return variables[varName] ?? match;
|
|
3545
3712
|
});
|
|
3546
3713
|
}
|
|
3547
3714
|
|
|
3548
3715
|
// src/evaluation/orchestrator.ts
|
|
3549
|
-
var
|
|
3550
|
-
var
|
|
3551
|
-
var
|
|
3716
|
+
var import_node_crypto2 = require("crypto");
|
|
3717
|
+
var import_promises10 = require("fs/promises");
|
|
3718
|
+
var import_node_path13 = __toESM(require("path"), 1);
|
|
3552
3719
|
|
|
3553
3720
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
3554
3721
|
var Node = class {
|
|
@@ -4111,6 +4278,7 @@ async function evaluateCandidate(options) {
|
|
|
4111
4278
|
}
|
|
4112
4279
|
}
|
|
4113
4280
|
return {
|
|
4281
|
+
timestamp: completedAt.toISOString(),
|
|
4114
4282
|
eval_id: evalCase.id,
|
|
4115
4283
|
dataset: evalCase.dataset,
|
|
4116
4284
|
conversation_id: evalCase.conversation_id,
|
|
@@ -4118,14 +4286,12 @@ async function evaluateCandidate(options) {
|
|
|
4118
4286
|
hits: score.hits,
|
|
4119
4287
|
misses: score.misses,
|
|
4120
4288
|
candidate_answer: candidate,
|
|
4121
|
-
expected_aspect_count: score.expectedAspectCount,
|
|
4122
4289
|
target: target.name,
|
|
4123
|
-
timestamp: completedAt.toISOString(),
|
|
4124
4290
|
reasoning: score.reasoning,
|
|
4125
4291
|
raw_aspects: score.rawAspects,
|
|
4126
4292
|
agent_provider_request: agentProviderRequest,
|
|
4127
4293
|
lm_provider_request: lmProviderRequest,
|
|
4128
|
-
|
|
4294
|
+
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
4129
4295
|
evaluator_results: evaluatorResults
|
|
4130
4296
|
};
|
|
4131
4297
|
}
|
|
@@ -4202,7 +4368,7 @@ async function runEvaluatorList(options) {
|
|
|
4202
4368
|
hits: score2.hits,
|
|
4203
4369
|
misses: score2.misses,
|
|
4204
4370
|
reasoning: score2.reasoning,
|
|
4205
|
-
|
|
4371
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
4206
4372
|
});
|
|
4207
4373
|
continue;
|
|
4208
4374
|
}
|
|
@@ -4229,7 +4395,7 @@ async function runEvaluatorList(options) {
|
|
|
4229
4395
|
hits: score2.hits,
|
|
4230
4396
|
misses: score2.misses,
|
|
4231
4397
|
reasoning: score2.reasoning,
|
|
4232
|
-
|
|
4398
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
4233
4399
|
});
|
|
4234
4400
|
continue;
|
|
4235
4401
|
}
|
|
@@ -4282,7 +4448,7 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
4282
4448
|
promptInputs,
|
|
4283
4449
|
now,
|
|
4284
4450
|
judgeProvider,
|
|
4285
|
-
|
|
4451
|
+
evaluatorTemplateOverride: customPrompt,
|
|
4286
4452
|
evaluator: config
|
|
4287
4453
|
});
|
|
4288
4454
|
}
|
|
@@ -4323,22 +4489,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
4323
4489
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
4324
4490
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
4325
4491
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
4326
|
-
const filePath =
|
|
4327
|
-
await (0,
|
|
4492
|
+
const filePath = import_node_path13.default.resolve(directory, filename);
|
|
4493
|
+
await (0, import_promises10.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
|
|
4328
4494
|
const payload = {
|
|
4329
4495
|
eval_id: evalCase.id,
|
|
4330
4496
|
question: promptInputs.question,
|
|
4331
4497
|
guidelines: promptInputs.guidelines,
|
|
4332
4498
|
guideline_paths: evalCase.guideline_paths
|
|
4333
4499
|
};
|
|
4334
|
-
await (0,
|
|
4500
|
+
await (0, import_promises10.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
4335
4501
|
}
|
|
4336
4502
|
function sanitizeFilename(value) {
|
|
4337
4503
|
if (!value) {
|
|
4338
4504
|
return "prompt";
|
|
4339
4505
|
}
|
|
4340
4506
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
4341
|
-
return sanitized.length > 0 ? sanitized : (0,
|
|
4507
|
+
return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
|
|
4342
4508
|
}
|
|
4343
4509
|
async function invokeProvider(provider, options) {
|
|
4344
4510
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
@@ -4394,6 +4560,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
4394
4560
|
}
|
|
4395
4561
|
}
|
|
4396
4562
|
return {
|
|
4563
|
+
timestamp: timestamp.toISOString(),
|
|
4397
4564
|
eval_id: evalCase.id,
|
|
4398
4565
|
dataset: evalCase.dataset,
|
|
4399
4566
|
conversation_id: evalCase.conversation_id,
|
|
@@ -4401,9 +4568,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
4401
4568
|
hits: [],
|
|
4402
4569
|
misses: [`Error: ${message}`],
|
|
4403
4570
|
candidate_answer: `Error occurred: ${message}`,
|
|
4404
|
-
expected_aspect_count: 0,
|
|
4405
4571
|
target: targetName,
|
|
4406
|
-
timestamp: timestamp.toISOString(),
|
|
4407
4572
|
raw_aspects: [],
|
|
4408
4573
|
agent_provider_request: agentProviderRequest,
|
|
4409
4574
|
lm_provider_request: lmProviderRequest,
|
|
@@ -4411,7 +4576,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
4411
4576
|
};
|
|
4412
4577
|
}
|
|
4413
4578
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
4414
|
-
const hash = (0,
|
|
4579
|
+
const hash = (0, import_node_crypto2.createHash)("sha256");
|
|
4415
4580
|
hash.update(provider.id);
|
|
4416
4581
|
hash.update(target.name);
|
|
4417
4582
|
hash.update(evalCase.id);
|