@agentv/core 0.6.1 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -4,9 +4,10 @@ import {
4
4
  buildSearchRoots,
5
5
  fileExists,
6
6
  findGitRoot,
7
+ isAgentProvider,
7
8
  readTextFile,
8
9
  resolveFileReference
9
- } from "./chunk-OW3SHBIJ.js";
10
+ } from "./chunk-UQLHF3T7.js";
10
11
 
11
12
  // src/evaluation/types.ts
12
13
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -134,6 +135,87 @@ function extractCodeBlocks(segments) {
134
135
  }
135
136
  return codeBlocks;
136
137
  }
138
+ async function processMessages(options) {
139
+ const {
140
+ messages,
141
+ searchRoots,
142
+ repoRootPath,
143
+ guidelinePatterns,
144
+ guidelinePaths,
145
+ textParts,
146
+ messageType,
147
+ verbose
148
+ } = options;
149
+ const segments = [];
150
+ for (const message of messages) {
151
+ const content = message.content;
152
+ if (typeof content === "string") {
153
+ segments.push({ type: "text", value: content });
154
+ if (textParts) {
155
+ textParts.push(content);
156
+ }
157
+ continue;
158
+ }
159
+ for (const rawSegment of content) {
160
+ if (!isJsonObject(rawSegment)) {
161
+ continue;
162
+ }
163
+ const segmentType = asString(rawSegment.type);
164
+ if (segmentType === "file") {
165
+ const rawValue = asString(rawSegment.value);
166
+ if (!rawValue) {
167
+ continue;
168
+ }
169
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference(
170
+ rawValue,
171
+ searchRoots
172
+ );
173
+ if (!resolvedPath) {
174
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
175
+ const context = messageType === "input" ? "" : " in expected_messages";
176
+ logWarning(`File not found${context}: ${displayPath}`, attempts);
177
+ continue;
178
+ }
179
+ try {
180
+ const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
181
+ if (messageType === "input" && guidelinePatterns && guidelinePaths) {
182
+ const relativeToRepo = path.relative(repoRootPath, resolvedPath);
183
+ if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
184
+ guidelinePaths.push(path.resolve(resolvedPath));
185
+ if (verbose) {
186
+ console.log(` [Guideline] Found: ${displayPath}`);
187
+ console.log(` Resolved to: ${resolvedPath}`);
188
+ }
189
+ continue;
190
+ }
191
+ }
192
+ segments.push({
193
+ type: "file",
194
+ path: displayPath,
195
+ text: fileContent,
196
+ resolvedPath: path.resolve(resolvedPath)
197
+ });
198
+ if (verbose) {
199
+ const label = messageType === "input" ? "[File]" : "[Expected Output File]";
200
+ console.log(` ${label} Found: ${displayPath}`);
201
+ console.log(` Resolved to: ${resolvedPath}`);
202
+ }
203
+ } catch (error) {
204
+ const context = messageType === "input" ? "" : " expected output";
205
+ logWarning(`Could not read${context} file ${resolvedPath}: ${error.message}`);
206
+ }
207
+ continue;
208
+ }
209
+ const clonedSegment = cloneJsonObject(rawSegment);
210
+ segments.push(clonedSegment);
211
+ const inlineValue = clonedSegment.value;
212
+ if (typeof inlineValue === "string" && textParts) {
213
+ textParts.push(inlineValue);
214
+ }
215
+ }
216
+ }
217
+ return segments;
218
+ }
137
219
  async function loadEvalCases(evalFilePath, repoRoot, options) {
138
220
  const verbose = options?.verbose ?? false;
139
221
  const absoluteTestPath = path.resolve(evalFilePath);
@@ -219,77 +301,34 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
219
301
  }
220
302
  }
221
303
  }
222
- const userSegments = [];
223
304
  const guidelinePaths = [];
224
- const userTextParts = [];
225
- for (const userMessage of userMessages) {
226
- const content = userMessage.content;
227
- if (typeof content === "string") {
228
- userSegments.push({ type: "text", value: content });
229
- userTextParts.push(content);
230
- continue;
231
- }
232
- for (const rawSegment of content) {
233
- if (!isJsonObject(rawSegment)) {
234
- continue;
235
- }
236
- const segmentType = asString(rawSegment.type);
237
- if (segmentType === "file") {
238
- const rawValue = asString(rawSegment.value);
239
- if (!rawValue) {
240
- continue;
241
- }
242
- const { displayPath, resolvedPath, attempted } = await resolveFileReference(
243
- rawValue,
244
- searchRoots
245
- );
246
- if (!resolvedPath) {
247
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
248
- logWarning(`File not found: ${displayPath}`, attempts);
249
- continue;
250
- }
251
- try {
252
- const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
253
- const relativeToRepo = path.relative(repoRootPath, resolvedPath);
254
- if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
255
- guidelinePaths.push(path.resolve(resolvedPath));
256
- if (verbose) {
257
- console.log(` [Guideline] Found: ${displayPath}`);
258
- console.log(` Resolved to: ${resolvedPath}`);
259
- }
260
- } else {
261
- userSegments.push({
262
- type: "file",
263
- path: displayPath,
264
- text: fileContent,
265
- resolvedPath: path.resolve(resolvedPath)
266
- });
267
- if (verbose) {
268
- console.log(` [File] Found: ${displayPath}`);
269
- console.log(` Resolved to: ${resolvedPath}`);
270
- }
271
- }
272
- } catch (error) {
273
- logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
274
- }
275
- continue;
276
- }
277
- const clonedSegment = cloneJsonObject(rawSegment);
278
- userSegments.push(clonedSegment);
279
- const inlineValue = clonedSegment.value;
280
- if (typeof inlineValue === "string") {
281
- userTextParts.push(inlineValue);
282
- }
283
- }
284
- }
285
- const codeSnippets = extractCodeBlocks(userSegments);
305
+ const inputTextParts = [];
306
+ const inputSegments = await processMessages({
307
+ messages: userMessages,
308
+ searchRoots,
309
+ repoRootPath,
310
+ guidelinePatterns,
311
+ guidelinePaths,
312
+ textParts: inputTextParts,
313
+ messageType: "input",
314
+ verbose
315
+ });
316
+ const outputSegments = await processMessages({
317
+ messages: assistantMessages,
318
+ searchRoots,
319
+ repoRootPath,
320
+ guidelinePatterns,
321
+ messageType: "output",
322
+ verbose
323
+ });
324
+ const codeSnippets = extractCodeBlocks(inputSegments);
286
325
  const assistantContent = assistantMessages[0]?.content;
287
- const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
288
- const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
326
+ const referenceAnswer = await resolveAssistantContent(assistantContent, searchRoots, verbose);
327
+ const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
289
328
  const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
290
329
  const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
291
330
  const userFilePaths = [];
292
- for (const segment of userSegments) {
331
+ for (const segment of inputSegments) {
293
332
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
294
333
  userFilePaths.push(segment.resolvedPath);
295
334
  }
@@ -302,15 +341,16 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
302
341
  id,
303
342
  dataset: datasetName,
304
343
  conversation_id: conversationId,
305
- task: userTextPrompt,
306
- user_segments: userSegments,
344
+ question,
345
+ input_segments: inputSegments,
346
+ output_segments: outputSegments,
307
347
  system_message: systemMessageContent,
308
- expected_assistant_raw: expectedAssistantRaw,
348
+ reference_answer: referenceAnswer,
309
349
  guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
310
350
  guideline_patterns: guidelinePatterns,
311
351
  file_paths: allFilePaths,
312
352
  code_snippets: codeSnippets,
313
- outcome,
353
+ expected_outcome: outcome,
314
354
  evaluator: testCaseEvaluatorKind,
315
355
  evaluators
316
356
  };
@@ -346,36 +386,36 @@ ${content}`);
346
386
  logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
347
387
  }
348
388
  }
349
- const requestParts = [];
350
- for (const segment of testCase.user_segments) {
389
+ const questionParts = [];
390
+ for (const segment of testCase.input_segments) {
351
391
  const typeValue = segment.type;
352
392
  if (typeof typeValue === "string" && typeValue === "file") {
353
393
  const pathValue = segment.path;
354
394
  const textValue = segment.text;
355
395
  const label = typeof pathValue === "string" ? pathValue : "file";
356
396
  const body = typeof textValue === "string" ? textValue : "";
357
- requestParts.push(`=== ${label} ===
397
+ questionParts.push(`=== ${label} ===
358
398
  ${body}`);
359
399
  continue;
360
400
  }
361
401
  if (typeof typeValue === "string" && typeValue === "text") {
362
402
  const value = segment.value;
363
403
  if (typeof value === "string") {
364
- requestParts.push(value);
404
+ questionParts.push(value);
365
405
  }
366
406
  continue;
367
407
  }
368
408
  const genericValue = segment.value;
369
409
  if (typeof genericValue === "string") {
370
- requestParts.push(genericValue);
410
+ questionParts.push(genericValue);
371
411
  }
372
412
  }
373
413
  if (testCase.code_snippets.length > 0) {
374
- requestParts.push(testCase.code_snippets.join("\n"));
414
+ questionParts.push(testCase.code_snippets.join("\n"));
375
415
  }
376
- const request = requestParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
416
+ const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
377
417
  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
378
- return { request, guidelines, systemMessage: testCase.system_message };
418
+ return { question, guidelines, systemMessage: testCase.system_message };
379
419
  }
380
420
  async function fileExists2(absolutePath) {
381
421
  try {
@@ -587,7 +627,7 @@ function buildChatPrompt(request) {
587
627
  ${request.guidelines.trim()}`);
588
628
  }
589
629
  const systemContent = systemSegments.join("\n\n");
590
- const userContent = request.prompt.trim();
630
+ const userContent = request.question.trim();
591
631
  const prompt = [
592
632
  {
593
633
  role: "system",
@@ -885,7 +925,7 @@ var CliProvider = class {
885
925
  healthcheck.commandTemplate,
886
926
  buildTemplateValues(
887
927
  {
888
- prompt: "",
928
+ question: "",
889
929
  guidelines: "",
890
930
  inputFiles: [],
891
931
  evalCaseId: "",
@@ -912,7 +952,7 @@ var CliProvider = class {
912
952
  function buildTemplateValues(request, config) {
913
953
  const inputFiles = normalizeInputFiles(request.inputFiles);
914
954
  return {
915
- PROMPT: shellEscape(request.prompt ?? ""),
955
+ PROMPT: shellEscape(request.question ?? ""),
916
956
  GUIDELINES: shellEscape(request.guidelines ?? ""),
917
957
  EVAL_ID: shellEscape(request.evalCaseId ?? ""),
918
958
  ATTEMPT: shellEscape(String(request.attempt ?? 0)),
@@ -971,11 +1011,64 @@ function formatTimeoutSuffix(timeoutMs) {
971
1011
  import { exec as execCallback, spawn } from "node:child_process";
972
1012
  import { randomUUID } from "node:crypto";
973
1013
  import { constants as constants2, createWriteStream } from "node:fs";
974
- import { access as access2, copyFile, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
1014
+ import { access as access2, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
975
1015
  import { tmpdir } from "node:os";
976
1016
  import path4 from "node:path";
977
1017
  import { promisify as promisify2 } from "node:util";
978
1018
 
1019
+ // src/evaluation/providers/codex-log-tracker.ts
1020
+ var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
1021
+ var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
1022
+ function getCodexLogStore() {
1023
+ const globalObject = globalThis;
1024
+ const existing = globalObject[GLOBAL_LOGS_KEY];
1025
+ if (existing) {
1026
+ return existing;
1027
+ }
1028
+ const created = [];
1029
+ globalObject[GLOBAL_LOGS_KEY] = created;
1030
+ return created;
1031
+ }
1032
+ function getSubscriberStore() {
1033
+ const globalObject = globalThis;
1034
+ const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
1035
+ if (existing) {
1036
+ return existing;
1037
+ }
1038
+ const created = /* @__PURE__ */ new Set();
1039
+ globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
1040
+ return created;
1041
+ }
1042
+ function notifySubscribers(entry) {
1043
+ const subscribers = Array.from(getSubscriberStore());
1044
+ for (const listener of subscribers) {
1045
+ try {
1046
+ listener(entry);
1047
+ } catch (error) {
1048
+ const message = error instanceof Error ? error.message : String(error);
1049
+ console.warn(`Codex log subscriber failed: ${message}`);
1050
+ }
1051
+ }
1052
+ }
1053
+ function recordCodexLogEntry(entry) {
1054
+ getCodexLogStore().push(entry);
1055
+ notifySubscribers(entry);
1056
+ }
1057
+ function consumeCodexLogEntries() {
1058
+ const store = getCodexLogStore();
1059
+ if (store.length === 0) {
1060
+ return [];
1061
+ }
1062
+ return store.splice(0, store.length);
1063
+ }
1064
+ function subscribeToCodexLogEntries(listener) {
1065
+ const store = getSubscriberStore();
1066
+ store.add(listener);
1067
+ return () => {
1068
+ store.delete(listener);
1069
+ };
1070
+ }
1071
+
979
1072
  // src/evaluation/providers/preread.ts
980
1073
  import path3 from "node:path";
981
1074
  function buildPromptDocument(request, inputFiles, options) {
@@ -993,7 +1086,7 @@ function buildPromptDocument(request, inputFiles, options) {
993
1086
  if (prereadBlock.length > 0) {
994
1087
  parts.push("\n", prereadBlock);
995
1088
  }
996
- parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
1089
+ parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
997
1090
  return parts.join("\n").trim();
998
1091
  }
999
1092
  function normalizeInputFiles2(inputFiles) {
@@ -1077,64 +1170,10 @@ function pathToFileUri(filePath) {
1077
1170
  return `file://${normalizedPath}`;
1078
1171
  }
1079
1172
 
1080
- // src/evaluation/providers/codex-log-tracker.ts
1081
- var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
1082
- var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
1083
- function getCodexLogStore() {
1084
- const globalObject = globalThis;
1085
- const existing = globalObject[GLOBAL_LOGS_KEY];
1086
- if (existing) {
1087
- return existing;
1088
- }
1089
- const created = [];
1090
- globalObject[GLOBAL_LOGS_KEY] = created;
1091
- return created;
1092
- }
1093
- function getSubscriberStore() {
1094
- const globalObject = globalThis;
1095
- const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
1096
- if (existing) {
1097
- return existing;
1098
- }
1099
- const created = /* @__PURE__ */ new Set();
1100
- globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
1101
- return created;
1102
- }
1103
- function notifySubscribers(entry) {
1104
- const subscribers = Array.from(getSubscriberStore());
1105
- for (const listener of subscribers) {
1106
- try {
1107
- listener(entry);
1108
- } catch (error) {
1109
- const message = error instanceof Error ? error.message : String(error);
1110
- console.warn(`Codex log subscriber failed: ${message}`);
1111
- }
1112
- }
1113
- }
1114
- function recordCodexLogEntry(entry) {
1115
- getCodexLogStore().push(entry);
1116
- notifySubscribers(entry);
1117
- }
1118
- function consumeCodexLogEntries() {
1119
- const store = getCodexLogStore();
1120
- if (store.length === 0) {
1121
- return [];
1122
- }
1123
- return store.splice(0, store.length);
1124
- }
1125
- function subscribeToCodexLogEntries(listener) {
1126
- const store = getSubscriberStore();
1127
- store.add(listener);
1128
- return () => {
1129
- store.delete(listener);
1130
- };
1131
- }
1132
-
1133
1173
  // src/evaluation/providers/codex.ts
1134
1174
  var execAsync2 = promisify2(execCallback);
1135
1175
  var WORKSPACE_PREFIX = "agentv-codex-";
1136
1176
  var PROMPT_FILENAME = "prompt.md";
1137
- var FILES_DIR = "files";
1138
1177
  var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
1139
1178
  var CodexProvider = class {
1140
1179
  id;
@@ -1157,21 +1196,10 @@ var CodexProvider = class {
1157
1196
  }
1158
1197
  await this.ensureEnvironmentReady();
1159
1198
  const inputFiles = normalizeInputFiles2(request.inputFiles);
1160
- const originalGuidelines = new Set(
1161
- collectGuidelineFiles(inputFiles, request.guideline_patterns).map((file) => path4.resolve(file))
1162
- );
1163
1199
  const workspaceRoot = await this.createWorkspace();
1164
1200
  const logger = await this.createStreamLogger(request).catch(() => void 0);
1165
1201
  try {
1166
- const { mirroredInputFiles, guidelineMirrors } = await this.mirrorInputFiles(
1167
- inputFiles,
1168
- workspaceRoot,
1169
- originalGuidelines
1170
- );
1171
- const promptContent = buildPromptDocument(request, mirroredInputFiles, {
1172
- guidelinePatterns: request.guideline_patterns,
1173
- guidelineOverrides: guidelineMirrors
1174
- });
1202
+ const promptContent = buildPromptDocument(request, inputFiles);
1175
1203
  const promptFile = path4.join(workspaceRoot, PROMPT_FILENAME);
1176
1204
  await writeFile(promptFile, promptContent, "utf8");
1177
1205
  const args = this.buildCodexArgs();
@@ -1200,7 +1228,7 @@ var CodexProvider = class {
1200
1228
  executable: this.resolvedExecutable ?? this.config.executable,
1201
1229
  promptFile,
1202
1230
  workspace: workspaceRoot,
1203
- inputFiles: mirroredInputFiles,
1231
+ inputFiles,
1204
1232
  logFile: logger?.filePath
1205
1233
  }
1206
1234
  };
@@ -1255,37 +1283,6 @@ var CodexProvider = class {
1255
1283
  throw error;
1256
1284
  }
1257
1285
  }
1258
- async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
1259
- if (!inputFiles || inputFiles.length === 0) {
1260
- return {
1261
- mirroredInputFiles: void 0,
1262
- guidelineMirrors: /* @__PURE__ */ new Set()
1263
- };
1264
- }
1265
- const filesRoot = path4.join(workspaceRoot, FILES_DIR);
1266
- await mkdir(filesRoot, { recursive: true });
1267
- const mirrored = [];
1268
- const guidelineMirrors = /* @__PURE__ */ new Set();
1269
- const nameCounts = /* @__PURE__ */ new Map();
1270
- for (const inputFile of inputFiles) {
1271
- const absoluteSource = path4.resolve(inputFile);
1272
- const baseName = path4.basename(absoluteSource);
1273
- const count = nameCounts.get(baseName) ?? 0;
1274
- nameCounts.set(baseName, count + 1);
1275
- const finalName = count === 0 ? baseName : `${baseName}.${count}`;
1276
- const destination = path4.join(filesRoot, finalName);
1277
- await copyFile(absoluteSource, destination);
1278
- const resolvedDestination = path4.resolve(destination);
1279
- mirrored.push(resolvedDestination);
1280
- if (guidelineOriginals.has(absoluteSource)) {
1281
- guidelineMirrors.add(resolvedDestination);
1282
- }
1283
- }
1284
- return {
1285
- mirroredInputFiles: mirrored,
1286
- guidelineMirrors
1287
- };
1288
- }
1289
1286
  async createWorkspace() {
1290
1287
  return await mkdtemp(path4.join(tmpdir(), WORKSPACE_PREFIX));
1291
1288
  }
@@ -1863,7 +1860,7 @@ var MockProvider = class {
1863
1860
  return {
1864
1861
  text: this.cannedResponse,
1865
1862
  raw: {
1866
- prompt: request.prompt,
1863
+ question: request.question,
1867
1864
  guidelines: request.guidelines
1868
1865
  }
1869
1866
  };
@@ -2256,23 +2253,25 @@ function resolveOptionalString(source, env, description, options) {
2256
2253
  if (trimmed.length === 0) {
2257
2254
  return void 0;
2258
2255
  }
2259
- const envValue = env[trimmed];
2260
- if (envValue !== void 0) {
2261
- if (envValue.trim().length === 0) {
2262
- throw new Error(`Environment variable '${trimmed}' for ${description} is empty`);
2256
+ const envVarMatch = trimmed.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
2257
+ if (envVarMatch) {
2258
+ const varName = envVarMatch[1];
2259
+ const envValue = env[varName];
2260
+ if (envValue !== void 0) {
2261
+ if (envValue.trim().length === 0) {
2262
+ throw new Error(`Environment variable '${varName}' for ${description} is empty`);
2263
+ }
2264
+ return envValue;
2263
2265
  }
2264
- return envValue;
2265
- }
2266
- const allowLiteral = options?.allowLiteral ?? false;
2267
- const optionalEnv = options?.optionalEnv ?? false;
2268
- const looksLikeEnv = isLikelyEnvReference(trimmed);
2269
- if (looksLikeEnv) {
2266
+ const optionalEnv = options?.optionalEnv ?? false;
2270
2267
  if (optionalEnv) {
2271
2268
  return void 0;
2272
2269
  }
2273
- if (!allowLiteral) {
2274
- throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
2275
- }
2270
+ throw new Error(`Environment variable '${varName}' required for ${description} is not set`);
2271
+ }
2272
+ const allowLiteral = options?.allowLiteral ?? false;
2273
+ if (!allowLiteral) {
2274
+ throw new Error(`${description} must use \${{ VARIABLE_NAME }} syntax for environment variables or be marked as allowing literals`);
2276
2275
  }
2277
2276
  return trimmed;
2278
2277
  }
@@ -2319,9 +2318,6 @@ function resolveOptionalBoolean(source) {
2319
2318
  }
2320
2319
  throw new Error("expected boolean value");
2321
2320
  }
2322
- function isLikelyEnvReference(value) {
2323
- return /^[A-Z0-9_]+$/.test(value);
2324
- }
2325
2321
  function resolveOptionalStringArray(source, env, description) {
2326
2322
  if (source === void 0 || source === null) {
2327
2323
  return void 0;
@@ -2342,21 +2338,25 @@ function resolveOptionalStringArray(source, env, description) {
2342
2338
  if (trimmed.length === 0) {
2343
2339
  throw new Error(`${description}[${i}] cannot be empty`);
2344
2340
  }
2345
- const envValue = env[trimmed];
2346
- if (envValue !== void 0) {
2347
- if (envValue.trim().length === 0) {
2348
- throw new Error(`Environment variable '${trimmed}' for ${description}[${i}] is empty`);
2341
+ const envVarMatch = trimmed.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
2342
+ if (envVarMatch) {
2343
+ const varName = envVarMatch[1];
2344
+ const envValue = env[varName];
2345
+ if (envValue !== void 0) {
2346
+ if (envValue.trim().length === 0) {
2347
+ throw new Error(`Environment variable '${varName}' for ${description}[${i}] is empty`);
2348
+ }
2349
+ resolved.push(envValue);
2350
+ continue;
2349
2351
  }
2350
- resolved.push(envValue);
2351
- } else {
2352
- resolved.push(trimmed);
2352
+ throw new Error(`Environment variable '${varName}' for ${description}[${i}] is not set`);
2353
2353
  }
2354
+ resolved.push(trimmed);
2354
2355
  }
2355
2356
  return resolved.length > 0 ? resolved : void 0;
2356
2357
  }
2357
2358
 
2358
2359
  // src/evaluation/providers/vscode.ts
2359
- import { readFile as readFile2 } from "node:fs/promises";
2360
2360
  import path5 from "node:path";
2361
2361
  import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
2362
2362
  var VSCodeProvider = class {
@@ -2400,7 +2400,7 @@ var VSCodeProvider = class {
2400
2400
  }
2401
2401
  };
2402
2402
  }
2403
- const responseText = await readFile2(session.responseFile, "utf8");
2403
+ const responseText = await readTextFile(session.responseFile);
2404
2404
  return {
2405
2405
  text: responseText,
2406
2406
  raw: {
@@ -2454,7 +2454,7 @@ var VSCodeProvider = class {
2454
2454
  }
2455
2455
  const responses = [];
2456
2456
  for (const [index, responseFile] of session.responseFiles.entries()) {
2457
- const responseText = await readFile2(responseFile, "utf8");
2457
+ const responseText = await readTextFile(responseFile);
2458
2458
  responses.push({
2459
2459
  text: responseText,
2460
2460
  raw: {
@@ -2479,7 +2479,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
2479
2479
  if (prereadBlock.length > 0) {
2480
2480
  parts.push("\n", prereadBlock);
2481
2481
  }
2482
- parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
2482
+ parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
2483
2483
  return parts.join("\n").trim();
2484
2484
  }
2485
2485
  function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
@@ -2604,7 +2604,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
2604
2604
 
2605
2605
  // src/evaluation/providers/targets-file.ts
2606
2606
  import { constants as constants3 } from "node:fs";
2607
- import { access as access3, readFile as readFile3 } from "node:fs/promises";
2607
+ import { access as access3, readFile as readFile2 } from "node:fs/promises";
2608
2608
  import path6 from "node:path";
2609
2609
  import { parse as parse2 } from "yaml";
2610
2610
  function isRecord(value) {
@@ -2672,7 +2672,7 @@ async function readTargetDefinitions(filePath) {
2672
2672
  if (!await fileExists3(absolutePath)) {
2673
2673
  throw new Error(`targets.yaml not found at ${absolutePath}`);
2674
2674
  }
2675
- const raw = await readFile3(absolutePath, "utf8");
2675
+ const raw = await readFile2(absolutePath, "utf8");
2676
2676
  const parsed = parse2(raw);
2677
2677
  if (!isRecord(parsed)) {
2678
2678
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
@@ -2716,30 +2716,7 @@ function resolveAndCreateProvider(definition, env = process.env) {
2716
2716
  }
2717
2717
 
2718
2718
  // src/evaluation/evaluators.ts
2719
- import { ax, f } from "@ax-llm/ax";
2720
2719
  import { randomUUID as randomUUID2 } from "node:crypto";
2721
- var LLM_JUDGE_SIGNATURE = f().input(
2722
- "evaluationContext",
2723
- f.object(
2724
- {
2725
- expectedOutcome: f.string("The expected outcome for the original task"),
2726
- request: f.string("The original task request"),
2727
- referenceAnswer: f.string("The gold standard reference answer"),
2728
- generatedAnswer: f.string("The answer to evaluate"),
2729
- guidelines: f.string("Additional evaluation guidelines or instructions").optional()
2730
- },
2731
- "Complete evaluation context for the judge"
2732
- )
2733
- ).output(
2734
- "evaluation",
2735
- f.object({
2736
- score: f.number("Score between 0.0 and 1.0").min(0).max(1),
2737
- hits: f.string("Brief specific achievement").array(),
2738
- misses: f.string("Brief specific failure or omission").array(),
2739
- reasoning: f.string("Concise explanation for the score").max(500)
2740
- })
2741
- ).build();
2742
- var LLM_JUDGE = ax(LLM_JUDGE_SIGNATURE);
2743
2720
  var LlmJudgeEvaluator = class {
2744
2721
  kind = "llm_judge";
2745
2722
  resolveJudgeProvider;
@@ -2757,52 +2734,29 @@ var LlmJudgeEvaluator = class {
2757
2734
  if (!judgeProvider) {
2758
2735
  throw new Error("No judge provider available for LLM grading");
2759
2736
  }
2760
- if (providerSupportsAx(judgeProvider)) {
2761
- return this.evaluateWithAx(context, judgeProvider);
2762
- }
2763
2737
  return this.evaluateWithPrompt(context, judgeProvider);
2764
2738
  }
2765
- async evaluateWithAx(context, judgeProvider) {
2766
- const ai = judgeProvider.getAxAI();
2767
- const guidelines = context.promptInputs.guidelines?.trim();
2768
- const evaluationContext = {
2769
- expectedOutcome: context.evalCase.outcome.trim(),
2770
- request: context.evalCase.task.trim(),
2771
- referenceAnswer: context.evalCase.expected_assistant_raw.trim(),
2772
- generatedAnswer: context.candidate.trim(),
2773
- ...guidelines ? { guidelines } : {}
2774
- };
2775
- const options = this.buildJudgeForwardOptions(context);
2776
- const result = await LLM_JUDGE.forward(ai, { evaluationContext }, options);
2777
- const evaluation = result.evaluation;
2778
- const expectedAspectCount = Math.max(
2779
- evaluation.hits.length + evaluation.misses.length,
2780
- 1
2781
- );
2782
- return {
2783
- score: evaluation.score,
2784
- hits: evaluation.hits,
2785
- misses: evaluation.misses,
2786
- expectedAspectCount,
2787
- reasoning: evaluation.reasoning,
2788
- evaluatorRawRequest: {
2789
- id: randomUUID2(),
2790
- provider: judgeProvider.id,
2791
- target: context.target.name,
2792
- method: "ax-structured-output",
2793
- signature: LLM_JUDGE_SIGNATURE.toString()
2794
- }
2795
- };
2796
- }
2797
2739
  async evaluateWithPrompt(context, judgeProvider) {
2798
- const prompt = buildQualityPrompt(context.evalCase, context.candidate);
2799
- const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
2740
+ let prompt = buildQualityPrompt(context.evalCase, context.candidate);
2741
+ let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
2742
+ if (systemPrompt && hasTemplateVariables(systemPrompt)) {
2743
+ const variables = {
2744
+ input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
2745
+ output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
2746
+ candidate_answer: context.candidate,
2747
+ reference_answer: context.evalCase.reference_answer,
2748
+ expected_outcome: context.evalCase.expected_outcome,
2749
+ question: context.evalCase.question
2750
+ };
2751
+ prompt = substituteVariables(systemPrompt, variables);
2752
+ systemPrompt = QUALITY_SYSTEM_PROMPT;
2753
+ }
2800
2754
  const metadata = {
2801
2755
  ...systemPrompt !== void 0 ? { systemPrompt } : {},
2802
2756
  ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
2803
2757
  };
2804
2758
  const response = await judgeProvider.invoke({
2805
- prompt,
2759
+ question: prompt,
2806
2760
  metadata,
2807
2761
  evalCaseId: context.evalCase.id,
2808
2762
  attempt: context.attempt,
@@ -2832,33 +2786,11 @@ var LlmJudgeEvaluator = class {
2832
2786
  evaluatorRawRequest
2833
2787
  };
2834
2788
  }
2835
- buildJudgeForwardOptions(context) {
2836
- const modelConfig = this.buildJudgeModelConfig();
2837
- if (modelConfig === void 0 && context.judgeModel === void 0) {
2838
- return void 0;
2839
- }
2840
- return {
2841
- ...context.judgeModel ? { model: context.judgeModel } : {},
2842
- ...modelConfig ? { modelConfig } : {}
2843
- };
2844
- }
2845
- buildJudgeModelConfig() {
2846
- if (this.maxOutputTokens === void 0 && this.temperature === void 0) {
2847
- return void 0;
2848
- }
2849
- return {
2850
- ...this.maxOutputTokens !== void 0 ? { maxTokens: this.maxOutputTokens } : {},
2851
- ...this.temperature !== void 0 ? { temperature: this.temperature } : {}
2852
- };
2853
- }
2854
2789
  };
2855
- function providerSupportsAx(provider) {
2856
- return typeof provider.getAxAI === "function";
2857
- }
2858
2790
  var QUALITY_SYSTEM_PROMPT = [
2859
- "You are an expert evaluator. Your goal is to grade the generated_answer based on how well it achieves the expected_outcome for the original task.",
2791
+ "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
2860
2792
  "",
2861
- "Use the reference_answer as a gold standard for a high-quality response. The generated_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
2793
+ "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
2862
2794
  "",
2863
2795
  "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
2864
2796
  "",
@@ -2871,18 +2803,18 @@ var QUALITY_SYSTEM_PROMPT = [
2871
2803
  ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
2872
2804
  "}"
2873
2805
  ].join("\n");
2874
- function buildQualityPrompt(testCase, candidate) {
2806
+ function buildQualityPrompt(evalCase, candidate) {
2875
2807
  const parts = [
2876
2808
  "[[ ## expected_outcome ## ]]",
2877
- testCase.outcome.trim(),
2809
+ evalCase.expected_outcome.trim(),
2878
2810
  "",
2879
- "[[ ## request ## ]]",
2880
- testCase.task.trim(),
2811
+ "[[ ## question ## ]]",
2812
+ evalCase.question.trim(),
2881
2813
  "",
2882
2814
  "[[ ## reference_answer ## ]]",
2883
- testCase.expected_assistant_raw.trim(),
2815
+ evalCase.reference_answer.trim(),
2884
2816
  "",
2885
- "[[ ## generated_answer ## ]]",
2817
+ "[[ ## candidate_answer ## ]]",
2886
2818
  candidate.trim(),
2887
2819
  "",
2888
2820
  "Respond with a single JSON object matching the schema described in the system prompt."
@@ -2982,14 +2914,14 @@ var CodeEvaluator = class {
2982
2914
  async evaluate(context) {
2983
2915
  const inputPayload = JSON.stringify(
2984
2916
  {
2985
- task: context.evalCase.task,
2986
- outcome: context.evalCase.outcome,
2987
- expected: context.evalCase.expected_assistant_raw,
2988
- output: context.candidate,
2917
+ question: context.evalCase.question,
2918
+ expected_outcome: context.evalCase.expected_outcome,
2919
+ reference_answer: context.evalCase.reference_answer,
2920
+ candidate_answer: context.candidate,
2989
2921
  system_message: context.promptInputs.systemMessage ?? "",
2990
2922
  guideline_paths: context.evalCase.guideline_paths,
2991
- attachments: context.evalCase.file_paths,
2992
- user_segments: context.evalCase.user_segments
2923
+ input_files: context.evalCase.file_paths,
2924
+ input_segments: context.evalCase.input_segments
2993
2925
  },
2994
2926
  null,
2995
2927
  2
@@ -3075,6 +3007,14 @@ function parseJsonSafe(payload) {
3075
3007
  return void 0;
3076
3008
  }
3077
3009
  }
3010
+ function hasTemplateVariables(text) {
3011
+ return /\$\{[a-zA-Z0-9_]+\}/.test(text);
3012
+ }
3013
+ function substituteVariables(template, variables) {
3014
+ return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
3015
+ return variables[varName] ?? match;
3016
+ });
3017
+ }
3078
3018
 
3079
3019
  // src/evaluation/orchestrator.ts
3080
3020
  import { createHash, randomUUID as randomUUID3 } from "node:crypto";
@@ -3397,7 +3337,8 @@ async function runEvaluation(options) {
3397
3337
  target.name,
3398
3338
  (now ?? (() => /* @__PURE__ */ new Date()))(),
3399
3339
  outcome.reason,
3400
- promptInputs
3340
+ promptInputs,
3341
+ primaryProvider
3401
3342
  );
3402
3343
  results.push(errorResult);
3403
3344
  if (onResult) {
@@ -3431,7 +3372,7 @@ async function runBatchEvaluation(options) {
3431
3372
  const batchRequests = evalCases.map((evalCase, index) => {
3432
3373
  const promptInputs = promptInputsList[index];
3433
3374
  return {
3434
- prompt: promptInputs.request,
3375
+ question: promptInputs.question,
3435
3376
  guidelines: promptInputs.guidelines,
3436
3377
  guideline_patterns: evalCase.guideline_patterns,
3437
3378
  inputFiles: evalCase.file_paths,
@@ -3481,7 +3422,7 @@ async function runBatchEvaluation(options) {
3481
3422
  agentTimeoutMs
3482
3423
  });
3483
3424
  } catch (error) {
3484
- const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
3425
+ const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
3485
3426
  results.push(errorResult);
3486
3427
  if (onResult) {
3487
3428
  await onResult(errorResult);
@@ -3558,7 +3499,7 @@ async function runEvalCase(options) {
3558
3499
  attempt += 1;
3559
3500
  continue;
3560
3501
  }
3561
- return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
3502
+ return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
3562
3503
  }
3563
3504
  }
3564
3505
  if (!providerResponse) {
@@ -3567,7 +3508,8 @@ async function runEvalCase(options) {
3567
3508
  target.name,
3568
3509
  nowFn(),
3569
3510
  lastError ?? new Error("Provider did not return a response"),
3570
- promptInputs
3511
+ promptInputs,
3512
+ provider
3571
3513
  );
3572
3514
  }
3573
3515
  if (cacheKey && cache && !cachedResponse) {
@@ -3587,7 +3529,7 @@ async function runEvalCase(options) {
3587
3529
  agentTimeoutMs
3588
3530
  });
3589
3531
  } catch (error) {
3590
- return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
3532
+ return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
3591
3533
  }
3592
3534
  }
3593
3535
  async function evaluateCandidate(options) {
@@ -3618,8 +3560,8 @@ async function evaluateCandidate(options) {
3618
3560
  });
3619
3561
  const completedAt = nowFn();
3620
3562
  const rawRequest = {
3621
- request: promptInputs.request,
3622
- guidelines: promptInputs.guidelines,
3563
+ question: promptInputs.question,
3564
+ ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
3623
3565
  guideline_paths: evalCase.guideline_paths,
3624
3566
  system_message: promptInputs.systemMessage ?? ""
3625
3567
  };
@@ -3630,7 +3572,7 @@ async function evaluateCandidate(options) {
3630
3572
  score: score.score,
3631
3573
  hits: score.hits,
3632
3574
  misses: score.misses,
3633
- model_answer: candidate,
3575
+ candidate_answer: candidate,
3634
3576
  expected_aspect_count: score.expectedAspectCount,
3635
3577
  target: target.name,
3636
3578
  timestamp: completedAt.toISOString(),
@@ -3840,7 +3782,7 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
3840
3782
  await mkdir2(path7.dirname(filePath), { recursive: true });
3841
3783
  const payload = {
3842
3784
  eval_id: evalCase.id,
3843
- request: promptInputs.request,
3785
+ question: promptInputs.question,
3844
3786
  guidelines: promptInputs.guidelines,
3845
3787
  guideline_paths: evalCase.guideline_paths
3846
3788
  };
@@ -3862,7 +3804,7 @@ async function invokeProvider(provider, options) {
3862
3804
  }
3863
3805
  try {
3864
3806
  return await provider.invoke({
3865
- prompt: promptInputs.request,
3807
+ question: promptInputs.question,
3866
3808
  guidelines: promptInputs.guidelines,
3867
3809
  guideline_patterns: evalCase.guideline_patterns,
3868
3810
  inputFiles: evalCase.file_paths,
@@ -3879,11 +3821,11 @@ async function invokeProvider(provider, options) {
3879
3821
  }
3880
3822
  }
3881
3823
  }
3882
- function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
3824
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
3883
3825
  const message = error instanceof Error ? error.message : String(error);
3884
3826
  const rawRequest = {
3885
- request: promptInputs.request,
3886
- guidelines: promptInputs.guidelines,
3827
+ question: promptInputs.question,
3828
+ ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
3887
3829
  guideline_paths: evalCase.guideline_paths,
3888
3830
  system_message: promptInputs.systemMessage ?? "",
3889
3831
  error: message
@@ -3895,7 +3837,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
3895
3837
  score: 0,
3896
3838
  hits: [],
3897
3839
  misses: [`Error: ${message}`],
3898
- model_answer: `Error occurred: ${message}`,
3840
+ candidate_answer: `Error occurred: ${message}`,
3899
3841
  expected_aspect_count: 0,
3900
3842
  target: targetName,
3901
3843
  timestamp: timestamp.toISOString(),
@@ -3908,7 +3850,7 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
3908
3850
  hash.update(provider.id);
3909
3851
  hash.update(target.name);
3910
3852
  hash.update(evalCase.id);
3911
- hash.update(promptInputs.request);
3853
+ hash.update(promptInputs.question);
3912
3854
  hash.update(promptInputs.guidelines);
3913
3855
  hash.update(promptInputs.systemMessage ?? "");
3914
3856
  return hash.digest("hex");