@agentv/core 0.7.0 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-L7I5UTJU.js → chunk-UQLHF3T7.js} +12 -3
- package/dist/chunk-UQLHF3T7.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +143 -2
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.d.cts +1 -1
- package/dist/evaluation/validation/index.d.ts +1 -1
- package/dist/evaluation/validation/index.js +143 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +79 -135
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +3 -3
- package/dist/index.d.ts +3 -3
- package/dist/index.js +69 -132
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-L7I5UTJU.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -95,7 +95,7 @@ type LlmJudgeEvaluatorConfig = {
|
|
|
95
95
|
};
|
|
96
96
|
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
|
|
97
97
|
/**
|
|
98
|
-
*
|
|
98
|
+
* Eval case definition sourced from AgentV specs.
|
|
99
99
|
*/
|
|
100
100
|
interface EvalCase {
|
|
101
101
|
readonly id: string;
|
|
@@ -104,7 +104,6 @@ interface EvalCase {
|
|
|
104
104
|
readonly question: string;
|
|
105
105
|
readonly input_segments: readonly JsonObject[];
|
|
106
106
|
readonly output_segments: readonly JsonObject[];
|
|
107
|
-
readonly system_message?: string;
|
|
108
107
|
readonly reference_answer: string;
|
|
109
108
|
readonly guideline_paths: readonly string[];
|
|
110
109
|
readonly guideline_patterns?: readonly string[];
|
|
@@ -115,7 +114,7 @@ interface EvalCase {
|
|
|
115
114
|
readonly evaluators?: readonly EvaluatorConfig[];
|
|
116
115
|
}
|
|
117
116
|
/**
|
|
118
|
-
* Evaluator scorecard for a single
|
|
117
|
+
* Evaluator scorecard for a single eval case run.
|
|
119
118
|
*/
|
|
120
119
|
interface EvaluationResult {
|
|
121
120
|
readonly eval_id: string;
|
|
@@ -159,6 +158,7 @@ declare function isGuidelineFile(filePath: string, patterns?: readonly string[])
|
|
|
159
158
|
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
160
159
|
type LoadOptions = {
|
|
161
160
|
readonly verbose?: boolean;
|
|
161
|
+
readonly evalId?: string;
|
|
162
162
|
};
|
|
163
163
|
/**
|
|
164
164
|
* Load eval cases from a AgentV YAML specification file.
|
package/dist/index.d.ts
CHANGED
|
@@ -95,7 +95,7 @@ type LlmJudgeEvaluatorConfig = {
|
|
|
95
95
|
};
|
|
96
96
|
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
|
|
97
97
|
/**
|
|
98
|
-
*
|
|
98
|
+
* Eval case definition sourced from AgentV specs.
|
|
99
99
|
*/
|
|
100
100
|
interface EvalCase {
|
|
101
101
|
readonly id: string;
|
|
@@ -104,7 +104,6 @@ interface EvalCase {
|
|
|
104
104
|
readonly question: string;
|
|
105
105
|
readonly input_segments: readonly JsonObject[];
|
|
106
106
|
readonly output_segments: readonly JsonObject[];
|
|
107
|
-
readonly system_message?: string;
|
|
108
107
|
readonly reference_answer: string;
|
|
109
108
|
readonly guideline_paths: readonly string[];
|
|
110
109
|
readonly guideline_patterns?: readonly string[];
|
|
@@ -115,7 +114,7 @@ interface EvalCase {
|
|
|
115
114
|
readonly evaluators?: readonly EvaluatorConfig[];
|
|
116
115
|
}
|
|
117
116
|
/**
|
|
118
|
-
* Evaluator scorecard for a single
|
|
117
|
+
* Evaluator scorecard for a single eval case run.
|
|
119
118
|
*/
|
|
120
119
|
interface EvaluationResult {
|
|
121
120
|
readonly eval_id: string;
|
|
@@ -159,6 +158,7 @@ declare function isGuidelineFile(filePath: string, patterns?: readonly string[])
|
|
|
159
158
|
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
160
159
|
type LoadOptions = {
|
|
161
160
|
readonly verbose?: boolean;
|
|
161
|
+
readonly evalId?: string;
|
|
162
162
|
};
|
|
163
163
|
/**
|
|
164
164
|
* Load eval cases from a AgentV YAML specification file.
|
package/dist/index.js
CHANGED
|
@@ -4,9 +4,10 @@ import {
|
|
|
4
4
|
buildSearchRoots,
|
|
5
5
|
fileExists,
|
|
6
6
|
findGitRoot,
|
|
7
|
+
isAgentProvider,
|
|
7
8
|
readTextFile,
|
|
8
9
|
resolveFileReference
|
|
9
|
-
} from "./chunk-
|
|
10
|
+
} from "./chunk-UQLHF3T7.js";
|
|
10
11
|
|
|
11
12
|
// src/evaluation/types.ts
|
|
12
13
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -217,6 +218,7 @@ async function processMessages(options) {
|
|
|
217
218
|
}
|
|
218
219
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
219
220
|
const verbose = options?.verbose ?? false;
|
|
221
|
+
const evalIdFilter = options?.evalId;
|
|
220
222
|
const absoluteTestPath = path.resolve(evalFilePath);
|
|
221
223
|
if (!await fileExists2(absoluteTestPath)) {
|
|
222
224
|
throw new Error(`Test file not found: ${evalFilePath}`);
|
|
@@ -248,62 +250,39 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
248
250
|
const results = [];
|
|
249
251
|
for (const rawEvalcase of rawTestcases) {
|
|
250
252
|
if (!isJsonObject(rawEvalcase)) {
|
|
251
|
-
logWarning("Skipping invalid
|
|
253
|
+
logWarning("Skipping invalid eval case entry (expected object)");
|
|
252
254
|
continue;
|
|
253
255
|
}
|
|
254
256
|
const evalcase = rawEvalcase;
|
|
255
257
|
const id = asString(evalcase.id);
|
|
258
|
+
if (evalIdFilter && id !== evalIdFilter) {
|
|
259
|
+
continue;
|
|
260
|
+
}
|
|
256
261
|
const conversationId = asString(evalcase.conversation_id);
|
|
257
262
|
const outcome = asString(evalcase.outcome);
|
|
258
263
|
const inputMessagesValue = evalcase.input_messages;
|
|
259
264
|
const expectedMessagesValue = evalcase.expected_messages;
|
|
260
265
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
261
|
-
logWarning(`Skipping incomplete
|
|
266
|
+
logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
262
267
|
continue;
|
|
263
268
|
}
|
|
264
269
|
if (!Array.isArray(expectedMessagesValue)) {
|
|
265
|
-
logWarning(`
|
|
270
|
+
logWarning(`Eval case '${id}' missing expected_messages array`);
|
|
266
271
|
continue;
|
|
267
272
|
}
|
|
268
273
|
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
269
274
|
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
const systemMessages = inputMessages.filter((message) => message.role === "system");
|
|
273
|
-
if (assistantMessages.length === 0) {
|
|
274
|
-
logWarning(`No assistant message found for test case: ${id}`);
|
|
275
|
+
if (expectedMessages.length === 0) {
|
|
276
|
+
logWarning(`No expected message found for eval case: ${id}`);
|
|
275
277
|
continue;
|
|
276
278
|
}
|
|
277
|
-
if (
|
|
278
|
-
logWarning(`Multiple
|
|
279
|
-
}
|
|
280
|
-
if (systemMessages.length > 1) {
|
|
281
|
-
logWarning(`Multiple system messages found for test case: ${id}, using first`);
|
|
282
|
-
}
|
|
283
|
-
let systemMessageContent;
|
|
284
|
-
if (systemMessages.length > 0) {
|
|
285
|
-
const content = systemMessages[0]?.content;
|
|
286
|
-
if (typeof content === "string") {
|
|
287
|
-
systemMessageContent = content;
|
|
288
|
-
} else if (Array.isArray(content)) {
|
|
289
|
-
const textParts = [];
|
|
290
|
-
for (const segment of content) {
|
|
291
|
-
if (isJsonObject(segment)) {
|
|
292
|
-
const value = segment.value;
|
|
293
|
-
if (typeof value === "string") {
|
|
294
|
-
textParts.push(value);
|
|
295
|
-
}
|
|
296
|
-
}
|
|
297
|
-
}
|
|
298
|
-
if (textParts.length > 0) {
|
|
299
|
-
systemMessageContent = textParts.join("\n\n");
|
|
300
|
-
}
|
|
301
|
-
}
|
|
279
|
+
if (expectedMessages.length > 1) {
|
|
280
|
+
logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
302
281
|
}
|
|
303
282
|
const guidelinePaths = [];
|
|
304
283
|
const inputTextParts = [];
|
|
305
284
|
const inputSegments = await processMessages({
|
|
306
|
-
messages:
|
|
285
|
+
messages: inputMessages,
|
|
307
286
|
searchRoots,
|
|
308
287
|
repoRootPath,
|
|
309
288
|
guidelinePatterns,
|
|
@@ -313,7 +292,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
313
292
|
verbose
|
|
314
293
|
});
|
|
315
294
|
const outputSegments = await processMessages({
|
|
316
|
-
messages:
|
|
295
|
+
messages: expectedMessages,
|
|
317
296
|
searchRoots,
|
|
318
297
|
repoRootPath,
|
|
319
298
|
guidelinePatterns,
|
|
@@ -321,10 +300,10 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
321
300
|
verbose
|
|
322
301
|
});
|
|
323
302
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
324
|
-
const
|
|
325
|
-
const referenceAnswer = await resolveAssistantContent(
|
|
303
|
+
const expectedContent = expectedMessages[0]?.content;
|
|
304
|
+
const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
|
|
326
305
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
327
|
-
const
|
|
306
|
+
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
328
307
|
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
329
308
|
const userFilePaths = [];
|
|
330
309
|
for (const segment of inputSegments) {
|
|
@@ -343,19 +322,18 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
343
322
|
question,
|
|
344
323
|
input_segments: inputSegments,
|
|
345
324
|
output_segments: outputSegments,
|
|
346
|
-
system_message: systemMessageContent,
|
|
347
325
|
reference_answer: referenceAnswer,
|
|
348
326
|
guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
|
|
349
327
|
guideline_patterns: guidelinePatterns,
|
|
350
328
|
file_paths: allFilePaths,
|
|
351
329
|
code_snippets: codeSnippets,
|
|
352
330
|
expected_outcome: outcome,
|
|
353
|
-
evaluator:
|
|
331
|
+
evaluator: evalCaseEvaluatorKind,
|
|
354
332
|
evaluators
|
|
355
333
|
};
|
|
356
334
|
if (verbose) {
|
|
357
335
|
console.log(`
|
|
358
|
-
[
|
|
336
|
+
[Eval Case: ${id}]`);
|
|
359
337
|
if (testCase.guideline_paths.length > 0) {
|
|
360
338
|
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
361
339
|
for (const guidelinePath of testCase.guideline_paths) {
|
|
@@ -414,7 +392,7 @@ ${body}`);
|
|
|
414
392
|
}
|
|
415
393
|
const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
416
394
|
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
417
|
-
return { question, guidelines
|
|
395
|
+
return { question, guidelines };
|
|
418
396
|
}
|
|
419
397
|
async function fileExists2(absolutePath) {
|
|
420
398
|
try {
|
|
@@ -1010,7 +988,7 @@ function formatTimeoutSuffix(timeoutMs) {
|
|
|
1010
988
|
import { exec as execCallback, spawn } from "node:child_process";
|
|
1011
989
|
import { randomUUID } from "node:crypto";
|
|
1012
990
|
import { constants as constants2, createWriteStream } from "node:fs";
|
|
1013
|
-
import { access as access2,
|
|
991
|
+
import { access as access2, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
|
|
1014
992
|
import { tmpdir } from "node:os";
|
|
1015
993
|
import path4 from "node:path";
|
|
1016
994
|
import { promisify as promisify2 } from "node:util";
|
|
@@ -1173,7 +1151,6 @@ function pathToFileUri(filePath) {
|
|
|
1173
1151
|
var execAsync2 = promisify2(execCallback);
|
|
1174
1152
|
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
1175
1153
|
var PROMPT_FILENAME = "prompt.md";
|
|
1176
|
-
var FILES_DIR = "files";
|
|
1177
1154
|
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
1178
1155
|
var CodexProvider = class {
|
|
1179
1156
|
id;
|
|
@@ -1196,21 +1173,10 @@ var CodexProvider = class {
|
|
|
1196
1173
|
}
|
|
1197
1174
|
await this.ensureEnvironmentReady();
|
|
1198
1175
|
const inputFiles = normalizeInputFiles2(request.inputFiles);
|
|
1199
|
-
const originalGuidelines = new Set(
|
|
1200
|
-
collectGuidelineFiles(inputFiles, request.guideline_patterns).map((file) => path4.resolve(file))
|
|
1201
|
-
);
|
|
1202
1176
|
const workspaceRoot = await this.createWorkspace();
|
|
1203
1177
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
1204
1178
|
try {
|
|
1205
|
-
const
|
|
1206
|
-
inputFiles,
|
|
1207
|
-
workspaceRoot,
|
|
1208
|
-
originalGuidelines
|
|
1209
|
-
);
|
|
1210
|
-
const promptContent = buildPromptDocument(request, mirroredInputFiles, {
|
|
1211
|
-
guidelinePatterns: request.guideline_patterns,
|
|
1212
|
-
guidelineOverrides: guidelineMirrors
|
|
1213
|
-
});
|
|
1179
|
+
const promptContent = buildPromptDocument(request, inputFiles);
|
|
1214
1180
|
const promptFile = path4.join(workspaceRoot, PROMPT_FILENAME);
|
|
1215
1181
|
await writeFile(promptFile, promptContent, "utf8");
|
|
1216
1182
|
const args = this.buildCodexArgs();
|
|
@@ -1239,7 +1205,7 @@ var CodexProvider = class {
|
|
|
1239
1205
|
executable: this.resolvedExecutable ?? this.config.executable,
|
|
1240
1206
|
promptFile,
|
|
1241
1207
|
workspace: workspaceRoot,
|
|
1242
|
-
inputFiles
|
|
1208
|
+
inputFiles,
|
|
1243
1209
|
logFile: logger?.filePath
|
|
1244
1210
|
}
|
|
1245
1211
|
};
|
|
@@ -1294,37 +1260,6 @@ var CodexProvider = class {
|
|
|
1294
1260
|
throw error;
|
|
1295
1261
|
}
|
|
1296
1262
|
}
|
|
1297
|
-
async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
|
|
1298
|
-
if (!inputFiles || inputFiles.length === 0) {
|
|
1299
|
-
return {
|
|
1300
|
-
mirroredInputFiles: void 0,
|
|
1301
|
-
guidelineMirrors: /* @__PURE__ */ new Set()
|
|
1302
|
-
};
|
|
1303
|
-
}
|
|
1304
|
-
const filesRoot = path4.join(workspaceRoot, FILES_DIR);
|
|
1305
|
-
await mkdir(filesRoot, { recursive: true });
|
|
1306
|
-
const mirrored = [];
|
|
1307
|
-
const guidelineMirrors = /* @__PURE__ */ new Set();
|
|
1308
|
-
const nameCounts = /* @__PURE__ */ new Map();
|
|
1309
|
-
for (const inputFile of inputFiles) {
|
|
1310
|
-
const absoluteSource = path4.resolve(inputFile);
|
|
1311
|
-
const baseName = path4.basename(absoluteSource);
|
|
1312
|
-
const count = nameCounts.get(baseName) ?? 0;
|
|
1313
|
-
nameCounts.set(baseName, count + 1);
|
|
1314
|
-
const finalName = count === 0 ? baseName : `${baseName}.${count}`;
|
|
1315
|
-
const destination = path4.join(filesRoot, finalName);
|
|
1316
|
-
await copyFile(absoluteSource, destination);
|
|
1317
|
-
const resolvedDestination = path4.resolve(destination);
|
|
1318
|
-
mirrored.push(resolvedDestination);
|
|
1319
|
-
if (guidelineOriginals.has(absoluteSource)) {
|
|
1320
|
-
guidelineMirrors.add(resolvedDestination);
|
|
1321
|
-
}
|
|
1322
|
-
}
|
|
1323
|
-
return {
|
|
1324
|
-
mirroredInputFiles: mirrored,
|
|
1325
|
-
guidelineMirrors
|
|
1326
|
-
};
|
|
1327
|
-
}
|
|
1328
1263
|
async createWorkspace() {
|
|
1329
1264
|
return await mkdtemp(path4.join(tmpdir(), WORKSPACE_PREFIX));
|
|
1330
1265
|
}
|
|
@@ -2295,23 +2230,25 @@ function resolveOptionalString(source, env, description, options) {
|
|
|
2295
2230
|
if (trimmed.length === 0) {
|
|
2296
2231
|
return void 0;
|
|
2297
2232
|
}
|
|
2298
|
-
const
|
|
2299
|
-
if (
|
|
2300
|
-
|
|
2301
|
-
|
|
2233
|
+
const envVarMatch = trimmed.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
|
|
2234
|
+
if (envVarMatch) {
|
|
2235
|
+
const varName = envVarMatch[1];
|
|
2236
|
+
const envValue = env[varName];
|
|
2237
|
+
if (envValue !== void 0) {
|
|
2238
|
+
if (envValue.trim().length === 0) {
|
|
2239
|
+
throw new Error(`Environment variable '${varName}' for ${description} is empty`);
|
|
2240
|
+
}
|
|
2241
|
+
return envValue;
|
|
2302
2242
|
}
|
|
2303
|
-
|
|
2304
|
-
}
|
|
2305
|
-
const allowLiteral = options?.allowLiteral ?? false;
|
|
2306
|
-
const optionalEnv = options?.optionalEnv ?? false;
|
|
2307
|
-
const looksLikeEnv = isLikelyEnvReference(trimmed);
|
|
2308
|
-
if (looksLikeEnv) {
|
|
2243
|
+
const optionalEnv = options?.optionalEnv ?? false;
|
|
2309
2244
|
if (optionalEnv) {
|
|
2310
2245
|
return void 0;
|
|
2311
2246
|
}
|
|
2312
|
-
|
|
2313
|
-
|
|
2314
|
-
|
|
2247
|
+
throw new Error(`Environment variable '${varName}' required for ${description} is not set`);
|
|
2248
|
+
}
|
|
2249
|
+
const allowLiteral = options?.allowLiteral ?? false;
|
|
2250
|
+
if (!allowLiteral) {
|
|
2251
|
+
throw new Error(`${description} must use \${{ VARIABLE_NAME }} syntax for environment variables or be marked as allowing literals`);
|
|
2315
2252
|
}
|
|
2316
2253
|
return trimmed;
|
|
2317
2254
|
}
|
|
@@ -2358,9 +2295,6 @@ function resolveOptionalBoolean(source) {
|
|
|
2358
2295
|
}
|
|
2359
2296
|
throw new Error("expected boolean value");
|
|
2360
2297
|
}
|
|
2361
|
-
function isLikelyEnvReference(value) {
|
|
2362
|
-
return /^[A-Z0-9_]+$/.test(value);
|
|
2363
|
-
}
|
|
2364
2298
|
function resolveOptionalStringArray(source, env, description) {
|
|
2365
2299
|
if (source === void 0 || source === null) {
|
|
2366
2300
|
return void 0;
|
|
@@ -2381,21 +2315,25 @@ function resolveOptionalStringArray(source, env, description) {
|
|
|
2381
2315
|
if (trimmed.length === 0) {
|
|
2382
2316
|
throw new Error(`${description}[${i}] cannot be empty`);
|
|
2383
2317
|
}
|
|
2384
|
-
const
|
|
2385
|
-
if (
|
|
2386
|
-
|
|
2387
|
-
|
|
2318
|
+
const envVarMatch = trimmed.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
|
|
2319
|
+
if (envVarMatch) {
|
|
2320
|
+
const varName = envVarMatch[1];
|
|
2321
|
+
const envValue = env[varName];
|
|
2322
|
+
if (envValue !== void 0) {
|
|
2323
|
+
if (envValue.trim().length === 0) {
|
|
2324
|
+
throw new Error(`Environment variable '${varName}' for ${description}[${i}] is empty`);
|
|
2325
|
+
}
|
|
2326
|
+
resolved.push(envValue);
|
|
2327
|
+
continue;
|
|
2388
2328
|
}
|
|
2389
|
-
|
|
2390
|
-
} else {
|
|
2391
|
-
resolved.push(trimmed);
|
|
2329
|
+
throw new Error(`Environment variable '${varName}' for ${description}[${i}] is not set`);
|
|
2392
2330
|
}
|
|
2331
|
+
resolved.push(trimmed);
|
|
2393
2332
|
}
|
|
2394
2333
|
return resolved.length > 0 ? resolved : void 0;
|
|
2395
2334
|
}
|
|
2396
2335
|
|
|
2397
2336
|
// src/evaluation/providers/vscode.ts
|
|
2398
|
-
import { readFile as readFile2 } from "node:fs/promises";
|
|
2399
2337
|
import path5 from "node:path";
|
|
2400
2338
|
import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
|
|
2401
2339
|
var VSCodeProvider = class {
|
|
@@ -2439,7 +2377,7 @@ var VSCodeProvider = class {
|
|
|
2439
2377
|
}
|
|
2440
2378
|
};
|
|
2441
2379
|
}
|
|
2442
|
-
const responseText = await
|
|
2380
|
+
const responseText = await readTextFile(session.responseFile);
|
|
2443
2381
|
return {
|
|
2444
2382
|
text: responseText,
|
|
2445
2383
|
raw: {
|
|
@@ -2493,7 +2431,7 @@ var VSCodeProvider = class {
|
|
|
2493
2431
|
}
|
|
2494
2432
|
const responses = [];
|
|
2495
2433
|
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
2496
|
-
const responseText = await
|
|
2434
|
+
const responseText = await readTextFile(responseFile);
|
|
2497
2435
|
responses.push({
|
|
2498
2436
|
text: responseText,
|
|
2499
2437
|
raw: {
|
|
@@ -2643,7 +2581,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
2643
2581
|
|
|
2644
2582
|
// src/evaluation/providers/targets-file.ts
|
|
2645
2583
|
import { constants as constants3 } from "node:fs";
|
|
2646
|
-
import { access as access3, readFile as
|
|
2584
|
+
import { access as access3, readFile as readFile2 } from "node:fs/promises";
|
|
2647
2585
|
import path6 from "node:path";
|
|
2648
2586
|
import { parse as parse2 } from "yaml";
|
|
2649
2587
|
function isRecord(value) {
|
|
@@ -2711,7 +2649,7 @@ async function readTargetDefinitions(filePath) {
|
|
|
2711
2649
|
if (!await fileExists3(absolutePath)) {
|
|
2712
2650
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
2713
2651
|
}
|
|
2714
|
-
const raw = await
|
|
2652
|
+
const raw = await readFile2(absolutePath, "utf8");
|
|
2715
2653
|
const parsed = parse2(raw);
|
|
2716
2654
|
if (!isRecord(parsed)) {
|
|
2717
2655
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
@@ -2957,7 +2895,6 @@ var CodeEvaluator = class {
|
|
|
2957
2895
|
expected_outcome: context.evalCase.expected_outcome,
|
|
2958
2896
|
reference_answer: context.evalCase.reference_answer,
|
|
2959
2897
|
candidate_answer: context.candidate,
|
|
2960
|
-
system_message: context.promptInputs.systemMessage ?? "",
|
|
2961
2898
|
guideline_paths: context.evalCase.guideline_paths,
|
|
2962
2899
|
input_files: context.evalCase.file_paths,
|
|
2963
2900
|
input_segments: context.evalCase.input_segments
|
|
@@ -3199,7 +3136,7 @@ function validateConcurrency(concurrency) {
|
|
|
3199
3136
|
// src/evaluation/orchestrator.ts
|
|
3200
3137
|
async function runEvaluation(options) {
|
|
3201
3138
|
const {
|
|
3202
|
-
testFilePath,
|
|
3139
|
+
testFilePath: evalFilePath,
|
|
3203
3140
|
repoRoot,
|
|
3204
3141
|
target,
|
|
3205
3142
|
targets,
|
|
@@ -3218,11 +3155,11 @@ async function runEvaluation(options) {
|
|
|
3218
3155
|
onProgress
|
|
3219
3156
|
} = options;
|
|
3220
3157
|
const load = loadEvalCases;
|
|
3221
|
-
const evalCases = await load(
|
|
3158
|
+
const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
|
|
3222
3159
|
const filteredEvalCases = filterEvalCases(evalCases, evalId);
|
|
3223
3160
|
if (filteredEvalCases.length === 0) {
|
|
3224
3161
|
if (evalId) {
|
|
3225
|
-
throw new Error(`
|
|
3162
|
+
throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
|
|
3226
3163
|
}
|
|
3227
3164
|
return [];
|
|
3228
3165
|
}
|
|
@@ -3376,7 +3313,8 @@ async function runEvaluation(options) {
|
|
|
3376
3313
|
target.name,
|
|
3377
3314
|
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
3378
3315
|
outcome.reason,
|
|
3379
|
-
promptInputs
|
|
3316
|
+
promptInputs,
|
|
3317
|
+
primaryProvider
|
|
3380
3318
|
);
|
|
3381
3319
|
results.push(errorResult);
|
|
3382
3320
|
if (onResult) {
|
|
@@ -3460,7 +3398,7 @@ async function runBatchEvaluation(options) {
|
|
|
3460
3398
|
agentTimeoutMs
|
|
3461
3399
|
});
|
|
3462
3400
|
} catch (error) {
|
|
3463
|
-
const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
3401
|
+
const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
3464
3402
|
results.push(errorResult);
|
|
3465
3403
|
if (onResult) {
|
|
3466
3404
|
await onResult(errorResult);
|
|
@@ -3537,7 +3475,7 @@ async function runEvalCase(options) {
|
|
|
3537
3475
|
attempt += 1;
|
|
3538
3476
|
continue;
|
|
3539
3477
|
}
|
|
3540
|
-
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
3478
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
3541
3479
|
}
|
|
3542
3480
|
}
|
|
3543
3481
|
if (!providerResponse) {
|
|
@@ -3546,7 +3484,8 @@ async function runEvalCase(options) {
|
|
|
3546
3484
|
target.name,
|
|
3547
3485
|
nowFn(),
|
|
3548
3486
|
lastError ?? new Error("Provider did not return a response"),
|
|
3549
|
-
promptInputs
|
|
3487
|
+
promptInputs,
|
|
3488
|
+
provider
|
|
3550
3489
|
);
|
|
3551
3490
|
}
|
|
3552
3491
|
if (cacheKey && cache && !cachedResponse) {
|
|
@@ -3566,7 +3505,7 @@ async function runEvalCase(options) {
|
|
|
3566
3505
|
agentTimeoutMs
|
|
3567
3506
|
});
|
|
3568
3507
|
} catch (error) {
|
|
3569
|
-
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
3508
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
|
|
3570
3509
|
}
|
|
3571
3510
|
}
|
|
3572
3511
|
async function evaluateCandidate(options) {
|
|
@@ -3598,9 +3537,8 @@ async function evaluateCandidate(options) {
|
|
|
3598
3537
|
const completedAt = nowFn();
|
|
3599
3538
|
const rawRequest = {
|
|
3600
3539
|
question: promptInputs.question,
|
|
3601
|
-
guidelines: promptInputs.guidelines,
|
|
3602
|
-
guideline_paths: evalCase.guideline_paths
|
|
3603
|
-
system_message: promptInputs.systemMessage ?? ""
|
|
3540
|
+
...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
|
|
3541
|
+
guideline_paths: evalCase.guideline_paths
|
|
3604
3542
|
};
|
|
3605
3543
|
return {
|
|
3606
3544
|
eval_id: evalCase.id,
|
|
@@ -3858,13 +3796,12 @@ async function invokeProvider(provider, options) {
|
|
|
3858
3796
|
}
|
|
3859
3797
|
}
|
|
3860
3798
|
}
|
|
3861
|
-
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
|
|
3799
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
|
|
3862
3800
|
const message = error instanceof Error ? error.message : String(error);
|
|
3863
3801
|
const rawRequest = {
|
|
3864
3802
|
question: promptInputs.question,
|
|
3865
|
-
guidelines: promptInputs.guidelines,
|
|
3803
|
+
...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
|
|
3866
3804
|
guideline_paths: evalCase.guideline_paths,
|
|
3867
|
-
system_message: promptInputs.systemMessage ?? "",
|
|
3868
3805
|
error: message
|
|
3869
3806
|
};
|
|
3870
3807
|
return {
|