@agentv/core 0.6.1 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -299,6 +299,87 @@ function extractCodeBlocks(segments) {
299
299
  }
300
300
  return codeBlocks;
301
301
  }
302
+ async function processMessages(options) {
303
+ const {
304
+ messages,
305
+ searchRoots,
306
+ repoRootPath,
307
+ guidelinePatterns,
308
+ guidelinePaths,
309
+ textParts,
310
+ messageType,
311
+ verbose
312
+ } = options;
313
+ const segments = [];
314
+ for (const message of messages) {
315
+ const content = message.content;
316
+ if (typeof content === "string") {
317
+ segments.push({ type: "text", value: content });
318
+ if (textParts) {
319
+ textParts.push(content);
320
+ }
321
+ continue;
322
+ }
323
+ for (const rawSegment of content) {
324
+ if (!isJsonObject(rawSegment)) {
325
+ continue;
326
+ }
327
+ const segmentType = asString(rawSegment.type);
328
+ if (segmentType === "file") {
329
+ const rawValue = asString(rawSegment.value);
330
+ if (!rawValue) {
331
+ continue;
332
+ }
333
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference(
334
+ rawValue,
335
+ searchRoots
336
+ );
337
+ if (!resolvedPath) {
338
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
339
+ const context = messageType === "input" ? "" : " in expected_messages";
340
+ logWarning(`File not found${context}: ${displayPath}`, attempts);
341
+ continue;
342
+ }
343
+ try {
344
+ const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
345
+ if (messageType === "input" && guidelinePatterns && guidelinePaths) {
346
+ const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
347
+ if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
348
+ guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
349
+ if (verbose) {
350
+ console.log(` [Guideline] Found: ${displayPath}`);
351
+ console.log(` Resolved to: ${resolvedPath}`);
352
+ }
353
+ continue;
354
+ }
355
+ }
356
+ segments.push({
357
+ type: "file",
358
+ path: displayPath,
359
+ text: fileContent,
360
+ resolvedPath: import_node_path2.default.resolve(resolvedPath)
361
+ });
362
+ if (verbose) {
363
+ const label = messageType === "input" ? "[File]" : "[Expected Output File]";
364
+ console.log(` ${label} Found: ${displayPath}`);
365
+ console.log(` Resolved to: ${resolvedPath}`);
366
+ }
367
+ } catch (error) {
368
+ const context = messageType === "input" ? "" : " expected output";
369
+ logWarning(`Could not read${context} file ${resolvedPath}: ${error.message}`);
370
+ }
371
+ continue;
372
+ }
373
+ const clonedSegment = cloneJsonObject(rawSegment);
374
+ segments.push(clonedSegment);
375
+ const inlineValue = clonedSegment.value;
376
+ if (typeof inlineValue === "string" && textParts) {
377
+ textParts.push(inlineValue);
378
+ }
379
+ }
380
+ }
381
+ return segments;
382
+ }
302
383
  async function loadEvalCases(evalFilePath, repoRoot, options) {
303
384
  const verbose = options?.verbose ?? false;
304
385
  const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
@@ -384,77 +465,34 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
384
465
  }
385
466
  }
386
467
  }
387
- const userSegments = [];
388
468
  const guidelinePaths = [];
389
- const userTextParts = [];
390
- for (const userMessage of userMessages) {
391
- const content = userMessage.content;
392
- if (typeof content === "string") {
393
- userSegments.push({ type: "text", value: content });
394
- userTextParts.push(content);
395
- continue;
396
- }
397
- for (const rawSegment of content) {
398
- if (!isJsonObject(rawSegment)) {
399
- continue;
400
- }
401
- const segmentType = asString(rawSegment.type);
402
- if (segmentType === "file") {
403
- const rawValue = asString(rawSegment.value);
404
- if (!rawValue) {
405
- continue;
406
- }
407
- const { displayPath, resolvedPath, attempted } = await resolveFileReference(
408
- rawValue,
409
- searchRoots
410
- );
411
- if (!resolvedPath) {
412
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
413
- logWarning(`File not found: ${displayPath}`, attempts);
414
- continue;
415
- }
416
- try {
417
- const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
418
- const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
419
- if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
420
- guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
421
- if (verbose) {
422
- console.log(` [Guideline] Found: ${displayPath}`);
423
- console.log(` Resolved to: ${resolvedPath}`);
424
- }
425
- } else {
426
- userSegments.push({
427
- type: "file",
428
- path: displayPath,
429
- text: fileContent,
430
- resolvedPath: import_node_path2.default.resolve(resolvedPath)
431
- });
432
- if (verbose) {
433
- console.log(` [File] Found: ${displayPath}`);
434
- console.log(` Resolved to: ${resolvedPath}`);
435
- }
436
- }
437
- } catch (error) {
438
- logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
439
- }
440
- continue;
441
- }
442
- const clonedSegment = cloneJsonObject(rawSegment);
443
- userSegments.push(clonedSegment);
444
- const inlineValue = clonedSegment.value;
445
- if (typeof inlineValue === "string") {
446
- userTextParts.push(inlineValue);
447
- }
448
- }
449
- }
450
- const codeSnippets = extractCodeBlocks(userSegments);
469
+ const inputTextParts = [];
470
+ const inputSegments = await processMessages({
471
+ messages: userMessages,
472
+ searchRoots,
473
+ repoRootPath,
474
+ guidelinePatterns,
475
+ guidelinePaths,
476
+ textParts: inputTextParts,
477
+ messageType: "input",
478
+ verbose
479
+ });
480
+ const outputSegments = await processMessages({
481
+ messages: assistantMessages,
482
+ searchRoots,
483
+ repoRootPath,
484
+ guidelinePatterns,
485
+ messageType: "output",
486
+ verbose
487
+ });
488
+ const codeSnippets = extractCodeBlocks(inputSegments);
451
489
  const assistantContent = assistantMessages[0]?.content;
452
- const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
453
- const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
490
+ const referenceAnswer = await resolveAssistantContent(assistantContent, searchRoots, verbose);
491
+ const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
454
492
  const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
455
493
  const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
456
494
  const userFilePaths = [];
457
- for (const segment of userSegments) {
495
+ for (const segment of inputSegments) {
458
496
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
459
497
  userFilePaths.push(segment.resolvedPath);
460
498
  }
@@ -467,15 +505,16 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
467
505
  id,
468
506
  dataset: datasetName,
469
507
  conversation_id: conversationId,
470
- task: userTextPrompt,
471
- user_segments: userSegments,
508
+ question,
509
+ input_segments: inputSegments,
510
+ output_segments: outputSegments,
472
511
  system_message: systemMessageContent,
473
- expected_assistant_raw: expectedAssistantRaw,
512
+ reference_answer: referenceAnswer,
474
513
  guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
475
514
  guideline_patterns: guidelinePatterns,
476
515
  file_paths: allFilePaths,
477
516
  code_snippets: codeSnippets,
478
- outcome,
517
+ expected_outcome: outcome,
479
518
  evaluator: testCaseEvaluatorKind,
480
519
  evaluators
481
520
  };
@@ -511,36 +550,36 @@ ${content}`);
511
550
  logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
512
551
  }
513
552
  }
514
- const requestParts = [];
515
- for (const segment of testCase.user_segments) {
553
+ const questionParts = [];
554
+ for (const segment of testCase.input_segments) {
516
555
  const typeValue = segment.type;
517
556
  if (typeof typeValue === "string" && typeValue === "file") {
518
557
  const pathValue = segment.path;
519
558
  const textValue = segment.text;
520
559
  const label = typeof pathValue === "string" ? pathValue : "file";
521
560
  const body = typeof textValue === "string" ? textValue : "";
522
- requestParts.push(`=== ${label} ===
561
+ questionParts.push(`=== ${label} ===
523
562
  ${body}`);
524
563
  continue;
525
564
  }
526
565
  if (typeof typeValue === "string" && typeValue === "text") {
527
566
  const value = segment.value;
528
567
  if (typeof value === "string") {
529
- requestParts.push(value);
568
+ questionParts.push(value);
530
569
  }
531
570
  continue;
532
571
  }
533
572
  const genericValue = segment.value;
534
573
  if (typeof genericValue === "string") {
535
- requestParts.push(genericValue);
574
+ questionParts.push(genericValue);
536
575
  }
537
576
  }
538
577
  if (testCase.code_snippets.length > 0) {
539
- requestParts.push(testCase.code_snippets.join("\n"));
578
+ questionParts.push(testCase.code_snippets.join("\n"));
540
579
  }
541
- const request = requestParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
580
+ const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
542
581
  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
543
- return { request, guidelines, systemMessage: testCase.system_message };
582
+ return { question, guidelines, systemMessage: testCase.system_message };
544
583
  }
545
584
  async function fileExists2(absolutePath) {
546
585
  try {
@@ -752,7 +791,7 @@ function buildChatPrompt(request) {
752
791
  ${request.guidelines.trim()}`);
753
792
  }
754
793
  const systemContent = systemSegments.join("\n\n");
755
- const userContent = request.prompt.trim();
794
+ const userContent = request.question.trim();
756
795
  const prompt = [
757
796
  {
758
797
  role: "system",
@@ -1050,7 +1089,7 @@ var CliProvider = class {
1050
1089
  healthcheck.commandTemplate,
1051
1090
  buildTemplateValues(
1052
1091
  {
1053
- prompt: "",
1092
+ question: "",
1054
1093
  guidelines: "",
1055
1094
  inputFiles: [],
1056
1095
  evalCaseId: "",
@@ -1077,7 +1116,7 @@ var CliProvider = class {
1077
1116
  function buildTemplateValues(request, config) {
1078
1117
  const inputFiles = normalizeInputFiles(request.inputFiles);
1079
1118
  return {
1080
- PROMPT: shellEscape(request.prompt ?? ""),
1119
+ PROMPT: shellEscape(request.question ?? ""),
1081
1120
  GUIDELINES: shellEscape(request.guidelines ?? ""),
1082
1121
  EVAL_ID: shellEscape(request.evalCaseId ?? ""),
1083
1122
  ATTEMPT: shellEscape(String(request.attempt ?? 0)),
@@ -1141,6 +1180,59 @@ var import_node_os = require("os");
1141
1180
  var import_node_path5 = __toESM(require("path"), 1);
1142
1181
  var import_node_util2 = require("util");
1143
1182
 
1183
+ // src/evaluation/providers/codex-log-tracker.ts
1184
+ var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
1185
+ var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
1186
+ function getCodexLogStore() {
1187
+ const globalObject = globalThis;
1188
+ const existing = globalObject[GLOBAL_LOGS_KEY];
1189
+ if (existing) {
1190
+ return existing;
1191
+ }
1192
+ const created = [];
1193
+ globalObject[GLOBAL_LOGS_KEY] = created;
1194
+ return created;
1195
+ }
1196
+ function getSubscriberStore() {
1197
+ const globalObject = globalThis;
1198
+ const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
1199
+ if (existing) {
1200
+ return existing;
1201
+ }
1202
+ const created = /* @__PURE__ */ new Set();
1203
+ globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
1204
+ return created;
1205
+ }
1206
+ function notifySubscribers(entry) {
1207
+ const subscribers = Array.from(getSubscriberStore());
1208
+ for (const listener of subscribers) {
1209
+ try {
1210
+ listener(entry);
1211
+ } catch (error) {
1212
+ const message = error instanceof Error ? error.message : String(error);
1213
+ console.warn(`Codex log subscriber failed: ${message}`);
1214
+ }
1215
+ }
1216
+ }
1217
+ function recordCodexLogEntry(entry) {
1218
+ getCodexLogStore().push(entry);
1219
+ notifySubscribers(entry);
1220
+ }
1221
+ function consumeCodexLogEntries() {
1222
+ const store = getCodexLogStore();
1223
+ if (store.length === 0) {
1224
+ return [];
1225
+ }
1226
+ return store.splice(0, store.length);
1227
+ }
1228
+ function subscribeToCodexLogEntries(listener) {
1229
+ const store = getSubscriberStore();
1230
+ store.add(listener);
1231
+ return () => {
1232
+ store.delete(listener);
1233
+ };
1234
+ }
1235
+
1144
1236
  // src/evaluation/providers/preread.ts
1145
1237
  var import_node_path4 = __toESM(require("path"), 1);
1146
1238
  function buildPromptDocument(request, inputFiles, options) {
@@ -1158,7 +1250,7 @@ function buildPromptDocument(request, inputFiles, options) {
1158
1250
  if (prereadBlock.length > 0) {
1159
1251
  parts.push("\n", prereadBlock);
1160
1252
  }
1161
- parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
1253
+ parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
1162
1254
  return parts.join("\n").trim();
1163
1255
  }
1164
1256
  function normalizeInputFiles2(inputFiles) {
@@ -1242,64 +1334,10 @@ function pathToFileUri(filePath) {
1242
1334
  return `file://${normalizedPath}`;
1243
1335
  }
1244
1336
 
1245
- // src/evaluation/providers/codex-log-tracker.ts
1246
- var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
1247
- var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
1248
- function getCodexLogStore() {
1249
- const globalObject = globalThis;
1250
- const existing = globalObject[GLOBAL_LOGS_KEY];
1251
- if (existing) {
1252
- return existing;
1253
- }
1254
- const created = [];
1255
- globalObject[GLOBAL_LOGS_KEY] = created;
1256
- return created;
1257
- }
1258
- function getSubscriberStore() {
1259
- const globalObject = globalThis;
1260
- const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
1261
- if (existing) {
1262
- return existing;
1263
- }
1264
- const created = /* @__PURE__ */ new Set();
1265
- globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
1266
- return created;
1267
- }
1268
- function notifySubscribers(entry) {
1269
- const subscribers = Array.from(getSubscriberStore());
1270
- for (const listener of subscribers) {
1271
- try {
1272
- listener(entry);
1273
- } catch (error) {
1274
- const message = error instanceof Error ? error.message : String(error);
1275
- console.warn(`Codex log subscriber failed: ${message}`);
1276
- }
1277
- }
1278
- }
1279
- function recordCodexLogEntry(entry) {
1280
- getCodexLogStore().push(entry);
1281
- notifySubscribers(entry);
1282
- }
1283
- function consumeCodexLogEntries() {
1284
- const store = getCodexLogStore();
1285
- if (store.length === 0) {
1286
- return [];
1287
- }
1288
- return store.splice(0, store.length);
1289
- }
1290
- function subscribeToCodexLogEntries(listener) {
1291
- const store = getSubscriberStore();
1292
- store.add(listener);
1293
- return () => {
1294
- store.delete(listener);
1295
- };
1296
- }
1297
-
1298
1337
  // src/evaluation/providers/codex.ts
1299
1338
  var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
1300
1339
  var WORKSPACE_PREFIX = "agentv-codex-";
1301
1340
  var PROMPT_FILENAME = "prompt.md";
1302
- var FILES_DIR = "files";
1303
1341
  var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
1304
1342
  var CodexProvider = class {
1305
1343
  id;
@@ -1322,21 +1360,10 @@ var CodexProvider = class {
1322
1360
  }
1323
1361
  await this.ensureEnvironmentReady();
1324
1362
  const inputFiles = normalizeInputFiles2(request.inputFiles);
1325
- const originalGuidelines = new Set(
1326
- collectGuidelineFiles(inputFiles, request.guideline_patterns).map((file) => import_node_path5.default.resolve(file))
1327
- );
1328
1363
  const workspaceRoot = await this.createWorkspace();
1329
1364
  const logger = await this.createStreamLogger(request).catch(() => void 0);
1330
1365
  try {
1331
- const { mirroredInputFiles, guidelineMirrors } = await this.mirrorInputFiles(
1332
- inputFiles,
1333
- workspaceRoot,
1334
- originalGuidelines
1335
- );
1336
- const promptContent = buildPromptDocument(request, mirroredInputFiles, {
1337
- guidelinePatterns: request.guideline_patterns,
1338
- guidelineOverrides: guidelineMirrors
1339
- });
1366
+ const promptContent = buildPromptDocument(request, inputFiles);
1340
1367
  const promptFile = import_node_path5.default.join(workspaceRoot, PROMPT_FILENAME);
1341
1368
  await (0, import_promises3.writeFile)(promptFile, promptContent, "utf8");
1342
1369
  const args = this.buildCodexArgs();
@@ -1365,7 +1392,7 @@ var CodexProvider = class {
1365
1392
  executable: this.resolvedExecutable ?? this.config.executable,
1366
1393
  promptFile,
1367
1394
  workspace: workspaceRoot,
1368
- inputFiles: mirroredInputFiles,
1395
+ inputFiles,
1369
1396
  logFile: logger?.filePath
1370
1397
  }
1371
1398
  };
@@ -1420,37 +1447,6 @@ var CodexProvider = class {
1420
1447
  throw error;
1421
1448
  }
1422
1449
  }
1423
- async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
1424
- if (!inputFiles || inputFiles.length === 0) {
1425
- return {
1426
- mirroredInputFiles: void 0,
1427
- guidelineMirrors: /* @__PURE__ */ new Set()
1428
- };
1429
- }
1430
- const filesRoot = import_node_path5.default.join(workspaceRoot, FILES_DIR);
1431
- await (0, import_promises3.mkdir)(filesRoot, { recursive: true });
1432
- const mirrored = [];
1433
- const guidelineMirrors = /* @__PURE__ */ new Set();
1434
- const nameCounts = /* @__PURE__ */ new Map();
1435
- for (const inputFile of inputFiles) {
1436
- const absoluteSource = import_node_path5.default.resolve(inputFile);
1437
- const baseName = import_node_path5.default.basename(absoluteSource);
1438
- const count = nameCounts.get(baseName) ?? 0;
1439
- nameCounts.set(baseName, count + 1);
1440
- const finalName = count === 0 ? baseName : `${baseName}.${count}`;
1441
- const destination = import_node_path5.default.join(filesRoot, finalName);
1442
- await (0, import_promises3.copyFile)(absoluteSource, destination);
1443
- const resolvedDestination = import_node_path5.default.resolve(destination);
1444
- mirrored.push(resolvedDestination);
1445
- if (guidelineOriginals.has(absoluteSource)) {
1446
- guidelineMirrors.add(resolvedDestination);
1447
- }
1448
- }
1449
- return {
1450
- mirroredInputFiles: mirrored,
1451
- guidelineMirrors
1452
- };
1453
- }
1454
1450
  async createWorkspace() {
1455
1451
  return await (0, import_promises3.mkdtemp)(import_node_path5.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
1456
1452
  }
@@ -2028,7 +2024,7 @@ var MockProvider = class {
2028
2024
  return {
2029
2025
  text: this.cannedResponse,
2030
2026
  raw: {
2031
- prompt: request.prompt,
2027
+ question: request.question,
2032
2028
  guidelines: request.guidelines
2033
2029
  }
2034
2030
  };
@@ -2421,23 +2417,25 @@ function resolveOptionalString(source, env, description, options) {
2421
2417
  if (trimmed.length === 0) {
2422
2418
  return void 0;
2423
2419
  }
2424
- const envValue = env[trimmed];
2425
- if (envValue !== void 0) {
2426
- if (envValue.trim().length === 0) {
2427
- throw new Error(`Environment variable '${trimmed}' for ${description} is empty`);
2420
+ const envVarMatch = trimmed.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
2421
+ if (envVarMatch) {
2422
+ const varName = envVarMatch[1];
2423
+ const envValue = env[varName];
2424
+ if (envValue !== void 0) {
2425
+ if (envValue.trim().length === 0) {
2426
+ throw new Error(`Environment variable '${varName}' for ${description} is empty`);
2427
+ }
2428
+ return envValue;
2428
2429
  }
2429
- return envValue;
2430
- }
2431
- const allowLiteral = options?.allowLiteral ?? false;
2432
- const optionalEnv = options?.optionalEnv ?? false;
2433
- const looksLikeEnv = isLikelyEnvReference(trimmed);
2434
- if (looksLikeEnv) {
2430
+ const optionalEnv = options?.optionalEnv ?? false;
2435
2431
  if (optionalEnv) {
2436
2432
  return void 0;
2437
2433
  }
2438
- if (!allowLiteral) {
2439
- throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
2440
- }
2434
+ throw new Error(`Environment variable '${varName}' required for ${description} is not set`);
2435
+ }
2436
+ const allowLiteral = options?.allowLiteral ?? false;
2437
+ if (!allowLiteral) {
2438
+ throw new Error(`${description} must use \${{ VARIABLE_NAME }} syntax for environment variables or be marked as allowing literals`);
2441
2439
  }
2442
2440
  return trimmed;
2443
2441
  }
@@ -2484,9 +2482,6 @@ function resolveOptionalBoolean(source) {
2484
2482
  }
2485
2483
  throw new Error("expected boolean value");
2486
2484
  }
2487
- function isLikelyEnvReference(value) {
2488
- return /^[A-Z0-9_]+$/.test(value);
2489
- }
2490
2485
  function resolveOptionalStringArray(source, env, description) {
2491
2486
  if (source === void 0 || source === null) {
2492
2487
  return void 0;
@@ -2507,21 +2502,25 @@ function resolveOptionalStringArray(source, env, description) {
2507
2502
  if (trimmed.length === 0) {
2508
2503
  throw new Error(`${description}[${i}] cannot be empty`);
2509
2504
  }
2510
- const envValue = env[trimmed];
2511
- if (envValue !== void 0) {
2512
- if (envValue.trim().length === 0) {
2513
- throw new Error(`Environment variable '${trimmed}' for ${description}[${i}] is empty`);
2505
+ const envVarMatch = trimmed.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
2506
+ if (envVarMatch) {
2507
+ const varName = envVarMatch[1];
2508
+ const envValue = env[varName];
2509
+ if (envValue !== void 0) {
2510
+ if (envValue.trim().length === 0) {
2511
+ throw new Error(`Environment variable '${varName}' for ${description}[${i}] is empty`);
2512
+ }
2513
+ resolved.push(envValue);
2514
+ continue;
2514
2515
  }
2515
- resolved.push(envValue);
2516
- } else {
2517
- resolved.push(trimmed);
2516
+ throw new Error(`Environment variable '${varName}' for ${description}[${i}] is not set`);
2518
2517
  }
2518
+ resolved.push(trimmed);
2519
2519
  }
2520
2520
  return resolved.length > 0 ? resolved : void 0;
2521
2521
  }
2522
2522
 
2523
2523
  // src/evaluation/providers/vscode.ts
2524
- var import_promises4 = require("fs/promises");
2525
2524
  var import_node_path6 = __toESM(require("path"), 1);
2526
2525
  var import_subagent = require("subagent");
2527
2526
  var VSCodeProvider = class {
@@ -2565,7 +2564,7 @@ var VSCodeProvider = class {
2565
2564
  }
2566
2565
  };
2567
2566
  }
2568
- const responseText = await (0, import_promises4.readFile)(session.responseFile, "utf8");
2567
+ const responseText = await readTextFile(session.responseFile);
2569
2568
  return {
2570
2569
  text: responseText,
2571
2570
  raw: {
@@ -2619,7 +2618,7 @@ var VSCodeProvider = class {
2619
2618
  }
2620
2619
  const responses = [];
2621
2620
  for (const [index, responseFile] of session.responseFiles.entries()) {
2622
- const responseText = await (0, import_promises4.readFile)(responseFile, "utf8");
2621
+ const responseText = await readTextFile(responseFile);
2623
2622
  responses.push({
2624
2623
  text: responseText,
2625
2624
  raw: {
@@ -2644,7 +2643,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
2644
2643
  if (prereadBlock.length > 0) {
2645
2644
  parts.push("\n", prereadBlock);
2646
2645
  }
2647
- parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
2646
+ parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
2648
2647
  return parts.join("\n").trim();
2649
2648
  }
2650
2649
  function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
@@ -2769,12 +2768,20 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
2769
2768
 
2770
2769
  // src/evaluation/providers/targets-file.ts
2771
2770
  var import_node_fs4 = require("fs");
2772
- var import_promises5 = require("fs/promises");
2771
+ var import_promises4 = require("fs/promises");
2773
2772
  var import_node_path7 = __toESM(require("path"), 1);
2774
2773
  var import_yaml2 = require("yaml");
2775
2774
 
2776
2775
  // src/evaluation/providers/types.ts
2777
- var TARGETS_SCHEMA_V2 = "agentv-targets-v2";
2776
+ var AGENT_PROVIDER_KINDS = [
2777
+ "codex",
2778
+ "vscode",
2779
+ "vscode-insiders"
2780
+ ];
2781
+ var TARGETS_SCHEMA_V2 = "agentv-targets-v2.1";
2782
+ function isAgentProvider(provider) {
2783
+ return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
2784
+ }
2778
2785
 
2779
2786
  // src/evaluation/providers/targets-file.ts
2780
2787
  function isRecord(value) {
@@ -2831,7 +2838,7 @@ function assertTargetDefinition(value, index, filePath) {
2831
2838
  }
2832
2839
  async function fileExists3(filePath) {
2833
2840
  try {
2834
- await (0, import_promises5.access)(filePath, import_node_fs4.constants.F_OK);
2841
+ await (0, import_promises4.access)(filePath, import_node_fs4.constants.F_OK);
2835
2842
  return true;
2836
2843
  } catch {
2837
2844
  return false;
@@ -2842,7 +2849,7 @@ async function readTargetDefinitions(filePath) {
2842
2849
  if (!await fileExists3(absolutePath)) {
2843
2850
  throw new Error(`targets.yaml not found at ${absolutePath}`);
2844
2851
  }
2845
- const raw = await (0, import_promises5.readFile)(absolutePath, "utf8");
2852
+ const raw = await (0, import_promises4.readFile)(absolutePath, "utf8");
2846
2853
  const parsed = (0, import_yaml2.parse)(raw);
2847
2854
  if (!isRecord(parsed)) {
2848
2855
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
@@ -2886,30 +2893,7 @@ function resolveAndCreateProvider(definition, env = process.env) {
2886
2893
  }
2887
2894
 
2888
2895
  // src/evaluation/evaluators.ts
2889
- var import_ax3 = require("@ax-llm/ax");
2890
2896
  var import_node_crypto2 = require("crypto");
2891
- var LLM_JUDGE_SIGNATURE = (0, import_ax3.f)().input(
2892
- "evaluationContext",
2893
- import_ax3.f.object(
2894
- {
2895
- expectedOutcome: import_ax3.f.string("The expected outcome for the original task"),
2896
- request: import_ax3.f.string("The original task request"),
2897
- referenceAnswer: import_ax3.f.string("The gold standard reference answer"),
2898
- generatedAnswer: import_ax3.f.string("The answer to evaluate"),
2899
- guidelines: import_ax3.f.string("Additional evaluation guidelines or instructions").optional()
2900
- },
2901
- "Complete evaluation context for the judge"
2902
- )
2903
- ).output(
2904
- "evaluation",
2905
- import_ax3.f.object({
2906
- score: import_ax3.f.number("Score between 0.0 and 1.0").min(0).max(1),
2907
- hits: import_ax3.f.string("Brief specific achievement").array(),
2908
- misses: import_ax3.f.string("Brief specific failure or omission").array(),
2909
- reasoning: import_ax3.f.string("Concise explanation for the score").max(500)
2910
- })
2911
- ).build();
2912
- var LLM_JUDGE = (0, import_ax3.ax)(LLM_JUDGE_SIGNATURE);
2913
2897
  var LlmJudgeEvaluator = class {
2914
2898
  kind = "llm_judge";
2915
2899
  resolveJudgeProvider;
@@ -2927,52 +2911,29 @@ var LlmJudgeEvaluator = class {
2927
2911
  if (!judgeProvider) {
2928
2912
  throw new Error("No judge provider available for LLM grading");
2929
2913
  }
2930
- if (providerSupportsAx(judgeProvider)) {
2931
- return this.evaluateWithAx(context, judgeProvider);
2932
- }
2933
2914
  return this.evaluateWithPrompt(context, judgeProvider);
2934
2915
  }
2935
- async evaluateWithAx(context, judgeProvider) {
2936
- const ai = judgeProvider.getAxAI();
2937
- const guidelines = context.promptInputs.guidelines?.trim();
2938
- const evaluationContext = {
2939
- expectedOutcome: context.evalCase.outcome.trim(),
2940
- request: context.evalCase.task.trim(),
2941
- referenceAnswer: context.evalCase.expected_assistant_raw.trim(),
2942
- generatedAnswer: context.candidate.trim(),
2943
- ...guidelines ? { guidelines } : {}
2944
- };
2945
- const options = this.buildJudgeForwardOptions(context);
2946
- const result = await LLM_JUDGE.forward(ai, { evaluationContext }, options);
2947
- const evaluation = result.evaluation;
2948
- const expectedAspectCount = Math.max(
2949
- evaluation.hits.length + evaluation.misses.length,
2950
- 1
2951
- );
2952
- return {
2953
- score: evaluation.score,
2954
- hits: evaluation.hits,
2955
- misses: evaluation.misses,
2956
- expectedAspectCount,
2957
- reasoning: evaluation.reasoning,
2958
- evaluatorRawRequest: {
2959
- id: (0, import_node_crypto2.randomUUID)(),
2960
- provider: judgeProvider.id,
2961
- target: context.target.name,
2962
- method: "ax-structured-output",
2963
- signature: LLM_JUDGE_SIGNATURE.toString()
2964
- }
2965
- };
2966
- }
2967
2916
  async evaluateWithPrompt(context, judgeProvider) {
2968
- const prompt = buildQualityPrompt(context.evalCase, context.candidate);
2969
- const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
2917
+ let prompt = buildQualityPrompt(context.evalCase, context.candidate);
2918
+ let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
2919
+ if (systemPrompt && hasTemplateVariables(systemPrompt)) {
2920
+ const variables = {
2921
+ input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
2922
+ output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
2923
+ candidate_answer: context.candidate,
2924
+ reference_answer: context.evalCase.reference_answer,
2925
+ expected_outcome: context.evalCase.expected_outcome,
2926
+ question: context.evalCase.question
2927
+ };
2928
+ prompt = substituteVariables(systemPrompt, variables);
2929
+ systemPrompt = QUALITY_SYSTEM_PROMPT;
2930
+ }
2970
2931
  const metadata = {
2971
2932
  ...systemPrompt !== void 0 ? { systemPrompt } : {},
2972
2933
  ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
2973
2934
  };
2974
2935
  const response = await judgeProvider.invoke({
2975
- prompt,
2936
+ question: prompt,
2976
2937
  metadata,
2977
2938
  evalCaseId: context.evalCase.id,
2978
2939
  attempt: context.attempt,
@@ -3002,33 +2963,11 @@ var LlmJudgeEvaluator = class {
3002
2963
  evaluatorRawRequest
3003
2964
  };
3004
2965
  }
3005
- buildJudgeForwardOptions(context) {
3006
- const modelConfig = this.buildJudgeModelConfig();
3007
- if (modelConfig === void 0 && context.judgeModel === void 0) {
3008
- return void 0;
3009
- }
3010
- return {
3011
- ...context.judgeModel ? { model: context.judgeModel } : {},
3012
- ...modelConfig ? { modelConfig } : {}
3013
- };
3014
- }
3015
- buildJudgeModelConfig() {
3016
- if (this.maxOutputTokens === void 0 && this.temperature === void 0) {
3017
- return void 0;
3018
- }
3019
- return {
3020
- ...this.maxOutputTokens !== void 0 ? { maxTokens: this.maxOutputTokens } : {},
3021
- ...this.temperature !== void 0 ? { temperature: this.temperature } : {}
3022
- };
3023
- }
3024
2966
  };
3025
- function providerSupportsAx(provider) {
3026
- return typeof provider.getAxAI === "function";
3027
- }
3028
2967
  var QUALITY_SYSTEM_PROMPT = [
3029
- "You are an expert evaluator. Your goal is to grade the generated_answer based on how well it achieves the expected_outcome for the original task.",
2968
+ "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
3030
2969
  "",
3031
- "Use the reference_answer as a gold standard for a high-quality response. The generated_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
2970
+ "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
3032
2971
  "",
3033
2972
  "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
3034
2973
  "",
@@ -3041,18 +2980,18 @@ var QUALITY_SYSTEM_PROMPT = [
3041
2980
  ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
3042
2981
  "}"
3043
2982
  ].join("\n");
3044
- function buildQualityPrompt(testCase, candidate) {
2983
+ function buildQualityPrompt(evalCase, candidate) {
3045
2984
  const parts = [
3046
2985
  "[[ ## expected_outcome ## ]]",
3047
- testCase.outcome.trim(),
2986
+ evalCase.expected_outcome.trim(),
3048
2987
  "",
3049
- "[[ ## request ## ]]",
3050
- testCase.task.trim(),
2988
+ "[[ ## question ## ]]",
2989
+ evalCase.question.trim(),
3051
2990
  "",
3052
2991
  "[[ ## reference_answer ## ]]",
3053
- testCase.expected_assistant_raw.trim(),
2992
+ evalCase.reference_answer.trim(),
3054
2993
  "",
3055
- "[[ ## generated_answer ## ]]",
2994
+ "[[ ## candidate_answer ## ]]",
3056
2995
  candidate.trim(),
3057
2996
  "",
3058
2997
  "Respond with a single JSON object matching the schema described in the system prompt."
@@ -3152,14 +3091,14 @@ var CodeEvaluator = class {
3152
3091
  async evaluate(context) {
3153
3092
  const inputPayload = JSON.stringify(
3154
3093
  {
3155
- task: context.evalCase.task,
3156
- outcome: context.evalCase.outcome,
3157
- expected: context.evalCase.expected_assistant_raw,
3158
- output: context.candidate,
3094
+ question: context.evalCase.question,
3095
+ expected_outcome: context.evalCase.expected_outcome,
3096
+ reference_answer: context.evalCase.reference_answer,
3097
+ candidate_answer: context.candidate,
3159
3098
  system_message: context.promptInputs.systemMessage ?? "",
3160
3099
  guideline_paths: context.evalCase.guideline_paths,
3161
- attachments: context.evalCase.file_paths,
3162
- user_segments: context.evalCase.user_segments
3100
+ input_files: context.evalCase.file_paths,
3101
+ input_segments: context.evalCase.input_segments
3163
3102
  },
3164
3103
  null,
3165
3104
  2
@@ -3245,10 +3184,18 @@ function parseJsonSafe(payload) {
3245
3184
  return void 0;
3246
3185
  }
3247
3186
  }
3187
+ function hasTemplateVariables(text) {
3188
+ return /\$\{[a-zA-Z0-9_]+\}/.test(text);
3189
+ }
3190
+ function substituteVariables(template, variables) {
3191
+ return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
3192
+ return variables[varName] ?? match;
3193
+ });
3194
+ }
3248
3195
 
3249
3196
  // src/evaluation/orchestrator.ts
3250
3197
  var import_node_crypto3 = require("crypto");
3251
- var import_promises6 = require("fs/promises");
3198
+ var import_promises5 = require("fs/promises");
3252
3199
  var import_node_path8 = __toESM(require("path"), 1);
3253
3200
 
3254
3201
  // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
@@ -3567,7 +3514,8 @@ async function runEvaluation(options) {
3567
3514
  target.name,
3568
3515
  (now ?? (() => /* @__PURE__ */ new Date()))(),
3569
3516
  outcome.reason,
3570
- promptInputs
3517
+ promptInputs,
3518
+ primaryProvider
3571
3519
  );
3572
3520
  results.push(errorResult);
3573
3521
  if (onResult) {
@@ -3601,7 +3549,7 @@ async function runBatchEvaluation(options) {
3601
3549
  const batchRequests = evalCases.map((evalCase, index) => {
3602
3550
  const promptInputs = promptInputsList[index];
3603
3551
  return {
3604
- prompt: promptInputs.request,
3552
+ question: promptInputs.question,
3605
3553
  guidelines: promptInputs.guidelines,
3606
3554
  guideline_patterns: evalCase.guideline_patterns,
3607
3555
  inputFiles: evalCase.file_paths,
@@ -3651,7 +3599,7 @@ async function runBatchEvaluation(options) {
3651
3599
  agentTimeoutMs
3652
3600
  });
3653
3601
  } catch (error) {
3654
- const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
3602
+ const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
3655
3603
  results.push(errorResult);
3656
3604
  if (onResult) {
3657
3605
  await onResult(errorResult);
@@ -3728,7 +3676,7 @@ async function runEvalCase(options) {
3728
3676
  attempt += 1;
3729
3677
  continue;
3730
3678
  }
3731
- return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
3679
+ return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
3732
3680
  }
3733
3681
  }
3734
3682
  if (!providerResponse) {
@@ -3737,7 +3685,8 @@ async function runEvalCase(options) {
3737
3685
  target.name,
3738
3686
  nowFn(),
3739
3687
  lastError ?? new Error("Provider did not return a response"),
3740
- promptInputs
3688
+ promptInputs,
3689
+ provider
3741
3690
  );
3742
3691
  }
3743
3692
  if (cacheKey && cache && !cachedResponse) {
@@ -3757,7 +3706,7 @@ async function runEvalCase(options) {
3757
3706
  agentTimeoutMs
3758
3707
  });
3759
3708
  } catch (error) {
3760
- return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
3709
+ return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
3761
3710
  }
3762
3711
  }
3763
3712
  async function evaluateCandidate(options) {
@@ -3788,8 +3737,8 @@ async function evaluateCandidate(options) {
3788
3737
  });
3789
3738
  const completedAt = nowFn();
3790
3739
  const rawRequest = {
3791
- request: promptInputs.request,
3792
- guidelines: promptInputs.guidelines,
3740
+ question: promptInputs.question,
3741
+ ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
3793
3742
  guideline_paths: evalCase.guideline_paths,
3794
3743
  system_message: promptInputs.systemMessage ?? ""
3795
3744
  };
@@ -3800,7 +3749,7 @@ async function evaluateCandidate(options) {
3800
3749
  score: score.score,
3801
3750
  hits: score.hits,
3802
3751
  misses: score.misses,
3803
- model_answer: candidate,
3752
+ candidate_answer: candidate,
3804
3753
  expected_aspect_count: score.expectedAspectCount,
3805
3754
  target: target.name,
3806
3755
  timestamp: completedAt.toISOString(),
@@ -4007,14 +3956,14 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
4007
3956
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
4008
3957
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
4009
3958
  const filePath = import_node_path8.default.resolve(directory, filename);
4010
- await (0, import_promises6.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
3959
+ await (0, import_promises5.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
4011
3960
  const payload = {
4012
3961
  eval_id: evalCase.id,
4013
- request: promptInputs.request,
3962
+ question: promptInputs.question,
4014
3963
  guidelines: promptInputs.guidelines,
4015
3964
  guideline_paths: evalCase.guideline_paths
4016
3965
  };
4017
- await (0, import_promises6.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
3966
+ await (0, import_promises5.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
4018
3967
  }
4019
3968
  function sanitizeFilename(value) {
4020
3969
  if (!value) {
@@ -4032,7 +3981,7 @@ async function invokeProvider(provider, options) {
4032
3981
  }
4033
3982
  try {
4034
3983
  return await provider.invoke({
4035
- prompt: promptInputs.request,
3984
+ question: promptInputs.question,
4036
3985
  guidelines: promptInputs.guidelines,
4037
3986
  guideline_patterns: evalCase.guideline_patterns,
4038
3987
  inputFiles: evalCase.file_paths,
@@ -4049,11 +3998,11 @@ async function invokeProvider(provider, options) {
4049
3998
  }
4050
3999
  }
4051
4000
  }
4052
- function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
4001
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
4053
4002
  const message = error instanceof Error ? error.message : String(error);
4054
4003
  const rawRequest = {
4055
- request: promptInputs.request,
4056
- guidelines: promptInputs.guidelines,
4004
+ question: promptInputs.question,
4005
+ ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
4057
4006
  guideline_paths: evalCase.guideline_paths,
4058
4007
  system_message: promptInputs.systemMessage ?? "",
4059
4008
  error: message
@@ -4065,7 +4014,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
4065
4014
  score: 0,
4066
4015
  hits: [],
4067
4016
  misses: [`Error: ${message}`],
4068
- model_answer: `Error occurred: ${message}`,
4017
+ candidate_answer: `Error occurred: ${message}`,
4069
4018
  expected_aspect_count: 0,
4070
4019
  target: targetName,
4071
4020
  timestamp: timestamp.toISOString(),
@@ -4078,7 +4027,7 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
4078
4027
  hash.update(provider.id);
4079
4028
  hash.update(target.name);
4080
4029
  hash.update(evalCase.id);
4081
- hash.update(promptInputs.request);
4030
+ hash.update(promptInputs.question);
4082
4031
  hash.update(promptInputs.guidelines);
4083
4032
  hash.update(promptInputs.systemMessage ?? "");
4084
4033
  return hash.digest("hex");