@agentv/core 0.7.0 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -95,7 +95,7 @@ type LlmJudgeEvaluatorConfig = {
95
95
  };
96
96
  type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
97
97
  /**
98
- * Test case definition sourced from AgentV specs.
98
+ * Eval case definition sourced from AgentV specs.
99
99
  */
100
100
  interface EvalCase {
101
101
  readonly id: string;
@@ -104,7 +104,6 @@ interface EvalCase {
104
104
  readonly question: string;
105
105
  readonly input_segments: readonly JsonObject[];
106
106
  readonly output_segments: readonly JsonObject[];
107
- readonly system_message?: string;
108
107
  readonly reference_answer: string;
109
108
  readonly guideline_paths: readonly string[];
110
109
  readonly guideline_patterns?: readonly string[];
@@ -115,7 +114,7 @@ interface EvalCase {
115
114
  readonly evaluators?: readonly EvaluatorConfig[];
116
115
  }
117
116
  /**
118
- * Evaluator scorecard for a single test case run.
117
+ * Evaluator scorecard for a single eval case run.
119
118
  */
120
119
  interface EvaluationResult {
121
120
  readonly eval_id: string;
@@ -159,6 +158,7 @@ declare function isGuidelineFile(filePath: string, patterns?: readonly string[])
159
158
  declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
160
159
  type LoadOptions = {
161
160
  readonly verbose?: boolean;
161
+ readonly evalId?: string;
162
162
  };
163
163
  /**
164
164
  * Load eval cases from a AgentV YAML specification file.
package/dist/index.d.ts CHANGED
@@ -95,7 +95,7 @@ type LlmJudgeEvaluatorConfig = {
95
95
  };
96
96
  type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
97
97
  /**
98
- * Test case definition sourced from AgentV specs.
98
+ * Eval case definition sourced from AgentV specs.
99
99
  */
100
100
  interface EvalCase {
101
101
  readonly id: string;
@@ -104,7 +104,6 @@ interface EvalCase {
104
104
  readonly question: string;
105
105
  readonly input_segments: readonly JsonObject[];
106
106
  readonly output_segments: readonly JsonObject[];
107
- readonly system_message?: string;
108
107
  readonly reference_answer: string;
109
108
  readonly guideline_paths: readonly string[];
110
109
  readonly guideline_patterns?: readonly string[];
@@ -115,7 +114,7 @@ interface EvalCase {
115
114
  readonly evaluators?: readonly EvaluatorConfig[];
116
115
  }
117
116
  /**
118
- * Evaluator scorecard for a single test case run.
117
+ * Evaluator scorecard for a single eval case run.
119
118
  */
120
119
  interface EvaluationResult {
121
120
  readonly eval_id: string;
@@ -159,6 +158,7 @@ declare function isGuidelineFile(filePath: string, patterns?: readonly string[])
159
158
  declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
160
159
  type LoadOptions = {
161
160
  readonly verbose?: boolean;
161
+ readonly evalId?: string;
162
162
  };
163
163
  /**
164
164
  * Load eval cases from a AgentV YAML specification file.
package/dist/index.js CHANGED
@@ -4,9 +4,10 @@ import {
4
4
  buildSearchRoots,
5
5
  fileExists,
6
6
  findGitRoot,
7
+ isAgentProvider,
7
8
  readTextFile,
8
9
  resolveFileReference
9
- } from "./chunk-L7I5UTJU.js";
10
+ } from "./chunk-UQLHF3T7.js";
10
11
 
11
12
  // src/evaluation/types.ts
12
13
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -217,6 +218,7 @@ async function processMessages(options) {
217
218
  }
218
219
  async function loadEvalCases(evalFilePath, repoRoot, options) {
219
220
  const verbose = options?.verbose ?? false;
221
+ const evalIdFilter = options?.evalId;
220
222
  const absoluteTestPath = path.resolve(evalFilePath);
221
223
  if (!await fileExists2(absoluteTestPath)) {
222
224
  throw new Error(`Test file not found: ${evalFilePath}`);
@@ -248,62 +250,39 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
248
250
  const results = [];
249
251
  for (const rawEvalcase of rawTestcases) {
250
252
  if (!isJsonObject(rawEvalcase)) {
251
- logWarning("Skipping invalid test case entry (expected object)");
253
+ logWarning("Skipping invalid eval case entry (expected object)");
252
254
  continue;
253
255
  }
254
256
  const evalcase = rawEvalcase;
255
257
  const id = asString(evalcase.id);
258
+ if (evalIdFilter && id !== evalIdFilter) {
259
+ continue;
260
+ }
256
261
  const conversationId = asString(evalcase.conversation_id);
257
262
  const outcome = asString(evalcase.outcome);
258
263
  const inputMessagesValue = evalcase.input_messages;
259
264
  const expectedMessagesValue = evalcase.expected_messages;
260
265
  if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
261
- logWarning(`Skipping incomplete test case: ${id ?? "unknown"}`);
266
+ logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
262
267
  continue;
263
268
  }
264
269
  if (!Array.isArray(expectedMessagesValue)) {
265
- logWarning(`Test case '${id}' missing expected_messages array`);
270
+ logWarning(`Eval case '${id}' missing expected_messages array`);
266
271
  continue;
267
272
  }
268
273
  const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
269
274
  const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
270
- const assistantMessages = expectedMessages.filter((message) => message.role === "assistant");
271
- const userMessages = inputMessages.filter((message) => message.role === "user");
272
- const systemMessages = inputMessages.filter((message) => message.role === "system");
273
- if (assistantMessages.length === 0) {
274
- logWarning(`No assistant message found for test case: ${id}`);
275
+ if (expectedMessages.length === 0) {
276
+ logWarning(`No expected message found for eval case: ${id}`);
275
277
  continue;
276
278
  }
277
- if (assistantMessages.length > 1) {
278
- logWarning(`Multiple assistant messages found for test case: ${id}, using first`);
279
- }
280
- if (systemMessages.length > 1) {
281
- logWarning(`Multiple system messages found for test case: ${id}, using first`);
282
- }
283
- let systemMessageContent;
284
- if (systemMessages.length > 0) {
285
- const content = systemMessages[0]?.content;
286
- if (typeof content === "string") {
287
- systemMessageContent = content;
288
- } else if (Array.isArray(content)) {
289
- const textParts = [];
290
- for (const segment of content) {
291
- if (isJsonObject(segment)) {
292
- const value = segment.value;
293
- if (typeof value === "string") {
294
- textParts.push(value);
295
- }
296
- }
297
- }
298
- if (textParts.length > 0) {
299
- systemMessageContent = textParts.join("\n\n");
300
- }
301
- }
279
+ if (expectedMessages.length > 1) {
280
+ logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
302
281
  }
303
282
  const guidelinePaths = [];
304
283
  const inputTextParts = [];
305
284
  const inputSegments = await processMessages({
306
- messages: userMessages,
285
+ messages: inputMessages,
307
286
  searchRoots,
308
287
  repoRootPath,
309
288
  guidelinePatterns,
@@ -313,7 +292,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
313
292
  verbose
314
293
  });
315
294
  const outputSegments = await processMessages({
316
- messages: assistantMessages,
295
+ messages: expectedMessages,
317
296
  searchRoots,
318
297
  repoRootPath,
319
298
  guidelinePatterns,
@@ -321,10 +300,10 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
321
300
  verbose
322
301
  });
323
302
  const codeSnippets = extractCodeBlocks(inputSegments);
324
- const assistantContent = assistantMessages[0]?.content;
325
- const referenceAnswer = await resolveAssistantContent(assistantContent, searchRoots, verbose);
303
+ const expectedContent = expectedMessages[0]?.content;
304
+ const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
326
305
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
327
- const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
306
+ const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
328
307
  const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
329
308
  const userFilePaths = [];
330
309
  for (const segment of inputSegments) {
@@ -343,19 +322,18 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
343
322
  question,
344
323
  input_segments: inputSegments,
345
324
  output_segments: outputSegments,
346
- system_message: systemMessageContent,
347
325
  reference_answer: referenceAnswer,
348
326
  guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
349
327
  guideline_patterns: guidelinePatterns,
350
328
  file_paths: allFilePaths,
351
329
  code_snippets: codeSnippets,
352
330
  expected_outcome: outcome,
353
- evaluator: testCaseEvaluatorKind,
331
+ evaluator: evalCaseEvaluatorKind,
354
332
  evaluators
355
333
  };
356
334
  if (verbose) {
357
335
  console.log(`
358
- [Test Case: ${id}]`);
336
+ [Eval Case: ${id}]`);
359
337
  if (testCase.guideline_paths.length > 0) {
360
338
  console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
361
339
  for (const guidelinePath of testCase.guideline_paths) {
@@ -414,7 +392,7 @@ ${body}`);
414
392
  }
415
393
  const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
416
394
  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
417
- return { question, guidelines, systemMessage: testCase.system_message };
395
+ return { question, guidelines };
418
396
  }
419
397
  async function fileExists2(absolutePath) {
420
398
  try {
@@ -1010,7 +988,7 @@ function formatTimeoutSuffix(timeoutMs) {
1010
988
  import { exec as execCallback, spawn } from "node:child_process";
1011
989
  import { randomUUID } from "node:crypto";
1012
990
  import { constants as constants2, createWriteStream } from "node:fs";
1013
- import { access as access2, copyFile, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
991
+ import { access as access2, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
1014
992
  import { tmpdir } from "node:os";
1015
993
  import path4 from "node:path";
1016
994
  import { promisify as promisify2 } from "node:util";
@@ -1173,7 +1151,6 @@ function pathToFileUri(filePath) {
1173
1151
  var execAsync2 = promisify2(execCallback);
1174
1152
  var WORKSPACE_PREFIX = "agentv-codex-";
1175
1153
  var PROMPT_FILENAME = "prompt.md";
1176
- var FILES_DIR = "files";
1177
1154
  var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
1178
1155
  var CodexProvider = class {
1179
1156
  id;
@@ -1196,21 +1173,10 @@ var CodexProvider = class {
1196
1173
  }
1197
1174
  await this.ensureEnvironmentReady();
1198
1175
  const inputFiles = normalizeInputFiles2(request.inputFiles);
1199
- const originalGuidelines = new Set(
1200
- collectGuidelineFiles(inputFiles, request.guideline_patterns).map((file) => path4.resolve(file))
1201
- );
1202
1176
  const workspaceRoot = await this.createWorkspace();
1203
1177
  const logger = await this.createStreamLogger(request).catch(() => void 0);
1204
1178
  try {
1205
- const { mirroredInputFiles, guidelineMirrors } = await this.mirrorInputFiles(
1206
- inputFiles,
1207
- workspaceRoot,
1208
- originalGuidelines
1209
- );
1210
- const promptContent = buildPromptDocument(request, mirroredInputFiles, {
1211
- guidelinePatterns: request.guideline_patterns,
1212
- guidelineOverrides: guidelineMirrors
1213
- });
1179
+ const promptContent = buildPromptDocument(request, inputFiles);
1214
1180
  const promptFile = path4.join(workspaceRoot, PROMPT_FILENAME);
1215
1181
  await writeFile(promptFile, promptContent, "utf8");
1216
1182
  const args = this.buildCodexArgs();
@@ -1239,7 +1205,7 @@ var CodexProvider = class {
1239
1205
  executable: this.resolvedExecutable ?? this.config.executable,
1240
1206
  promptFile,
1241
1207
  workspace: workspaceRoot,
1242
- inputFiles: mirroredInputFiles,
1208
+ inputFiles,
1243
1209
  logFile: logger?.filePath
1244
1210
  }
1245
1211
  };
@@ -1294,37 +1260,6 @@ var CodexProvider = class {
1294
1260
  throw error;
1295
1261
  }
1296
1262
  }
1297
- async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
1298
- if (!inputFiles || inputFiles.length === 0) {
1299
- return {
1300
- mirroredInputFiles: void 0,
1301
- guidelineMirrors: /* @__PURE__ */ new Set()
1302
- };
1303
- }
1304
- const filesRoot = path4.join(workspaceRoot, FILES_DIR);
1305
- await mkdir(filesRoot, { recursive: true });
1306
- const mirrored = [];
1307
- const guidelineMirrors = /* @__PURE__ */ new Set();
1308
- const nameCounts = /* @__PURE__ */ new Map();
1309
- for (const inputFile of inputFiles) {
1310
- const absoluteSource = path4.resolve(inputFile);
1311
- const baseName = path4.basename(absoluteSource);
1312
- const count = nameCounts.get(baseName) ?? 0;
1313
- nameCounts.set(baseName, count + 1);
1314
- const finalName = count === 0 ? baseName : `${baseName}.${count}`;
1315
- const destination = path4.join(filesRoot, finalName);
1316
- await copyFile(absoluteSource, destination);
1317
- const resolvedDestination = path4.resolve(destination);
1318
- mirrored.push(resolvedDestination);
1319
- if (guidelineOriginals.has(absoluteSource)) {
1320
- guidelineMirrors.add(resolvedDestination);
1321
- }
1322
- }
1323
- return {
1324
- mirroredInputFiles: mirrored,
1325
- guidelineMirrors
1326
- };
1327
- }
1328
1263
  async createWorkspace() {
1329
1264
  return await mkdtemp(path4.join(tmpdir(), WORKSPACE_PREFIX));
1330
1265
  }
@@ -2295,23 +2230,25 @@ function resolveOptionalString(source, env, description, options) {
2295
2230
  if (trimmed.length === 0) {
2296
2231
  return void 0;
2297
2232
  }
2298
- const envValue = env[trimmed];
2299
- if (envValue !== void 0) {
2300
- if (envValue.trim().length === 0) {
2301
- throw new Error(`Environment variable '${trimmed}' for ${description} is empty`);
2233
+ const envVarMatch = trimmed.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
2234
+ if (envVarMatch) {
2235
+ const varName = envVarMatch[1];
2236
+ const envValue = env[varName];
2237
+ if (envValue !== void 0) {
2238
+ if (envValue.trim().length === 0) {
2239
+ throw new Error(`Environment variable '${varName}' for ${description} is empty`);
2240
+ }
2241
+ return envValue;
2302
2242
  }
2303
- return envValue;
2304
- }
2305
- const allowLiteral = options?.allowLiteral ?? false;
2306
- const optionalEnv = options?.optionalEnv ?? false;
2307
- const looksLikeEnv = isLikelyEnvReference(trimmed);
2308
- if (looksLikeEnv) {
2243
+ const optionalEnv = options?.optionalEnv ?? false;
2309
2244
  if (optionalEnv) {
2310
2245
  return void 0;
2311
2246
  }
2312
- if (!allowLiteral) {
2313
- throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
2314
- }
2247
+ throw new Error(`Environment variable '${varName}' required for ${description} is not set`);
2248
+ }
2249
+ const allowLiteral = options?.allowLiteral ?? false;
2250
+ if (!allowLiteral) {
2251
+ throw new Error(`${description} must use \${{ VARIABLE_NAME }} syntax for environment variables or be marked as allowing literals`);
2315
2252
  }
2316
2253
  return trimmed;
2317
2254
  }
@@ -2358,9 +2295,6 @@ function resolveOptionalBoolean(source) {
2358
2295
  }
2359
2296
  throw new Error("expected boolean value");
2360
2297
  }
2361
- function isLikelyEnvReference(value) {
2362
- return /^[A-Z0-9_]+$/.test(value);
2363
- }
2364
2298
  function resolveOptionalStringArray(source, env, description) {
2365
2299
  if (source === void 0 || source === null) {
2366
2300
  return void 0;
@@ -2381,21 +2315,25 @@ function resolveOptionalStringArray(source, env, description) {
2381
2315
  if (trimmed.length === 0) {
2382
2316
  throw new Error(`${description}[${i}] cannot be empty`);
2383
2317
  }
2384
- const envValue = env[trimmed];
2385
- if (envValue !== void 0) {
2386
- if (envValue.trim().length === 0) {
2387
- throw new Error(`Environment variable '${trimmed}' for ${description}[${i}] is empty`);
2318
+ const envVarMatch = trimmed.match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
2319
+ if (envVarMatch) {
2320
+ const varName = envVarMatch[1];
2321
+ const envValue = env[varName];
2322
+ if (envValue !== void 0) {
2323
+ if (envValue.trim().length === 0) {
2324
+ throw new Error(`Environment variable '${varName}' for ${description}[${i}] is empty`);
2325
+ }
2326
+ resolved.push(envValue);
2327
+ continue;
2388
2328
  }
2389
- resolved.push(envValue);
2390
- } else {
2391
- resolved.push(trimmed);
2329
+ throw new Error(`Environment variable '${varName}' for ${description}[${i}] is not set`);
2392
2330
  }
2331
+ resolved.push(trimmed);
2393
2332
  }
2394
2333
  return resolved.length > 0 ? resolved : void 0;
2395
2334
  }
2396
2335
 
2397
2336
  // src/evaluation/providers/vscode.ts
2398
- import { readFile as readFile2 } from "node:fs/promises";
2399
2337
  import path5 from "node:path";
2400
2338
  import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
2401
2339
  var VSCodeProvider = class {
@@ -2439,7 +2377,7 @@ var VSCodeProvider = class {
2439
2377
  }
2440
2378
  };
2441
2379
  }
2442
- const responseText = await readFile2(session.responseFile, "utf8");
2380
+ const responseText = await readTextFile(session.responseFile);
2443
2381
  return {
2444
2382
  text: responseText,
2445
2383
  raw: {
@@ -2493,7 +2431,7 @@ var VSCodeProvider = class {
2493
2431
  }
2494
2432
  const responses = [];
2495
2433
  for (const [index, responseFile] of session.responseFiles.entries()) {
2496
- const responseText = await readFile2(responseFile, "utf8");
2434
+ const responseText = await readTextFile(responseFile);
2497
2435
  responses.push({
2498
2436
  text: responseText,
2499
2437
  raw: {
@@ -2643,7 +2581,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
2643
2581
 
2644
2582
  // src/evaluation/providers/targets-file.ts
2645
2583
  import { constants as constants3 } from "node:fs";
2646
- import { access as access3, readFile as readFile3 } from "node:fs/promises";
2584
+ import { access as access3, readFile as readFile2 } from "node:fs/promises";
2647
2585
  import path6 from "node:path";
2648
2586
  import { parse as parse2 } from "yaml";
2649
2587
  function isRecord(value) {
@@ -2711,7 +2649,7 @@ async function readTargetDefinitions(filePath) {
2711
2649
  if (!await fileExists3(absolutePath)) {
2712
2650
  throw new Error(`targets.yaml not found at ${absolutePath}`);
2713
2651
  }
2714
- const raw = await readFile3(absolutePath, "utf8");
2652
+ const raw = await readFile2(absolutePath, "utf8");
2715
2653
  const parsed = parse2(raw);
2716
2654
  if (!isRecord(parsed)) {
2717
2655
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
@@ -2957,7 +2895,6 @@ var CodeEvaluator = class {
2957
2895
  expected_outcome: context.evalCase.expected_outcome,
2958
2896
  reference_answer: context.evalCase.reference_answer,
2959
2897
  candidate_answer: context.candidate,
2960
- system_message: context.promptInputs.systemMessage ?? "",
2961
2898
  guideline_paths: context.evalCase.guideline_paths,
2962
2899
  input_files: context.evalCase.file_paths,
2963
2900
  input_segments: context.evalCase.input_segments
@@ -3199,7 +3136,7 @@ function validateConcurrency(concurrency) {
3199
3136
  // src/evaluation/orchestrator.ts
3200
3137
  async function runEvaluation(options) {
3201
3138
  const {
3202
- testFilePath,
3139
+ testFilePath: evalFilePath,
3203
3140
  repoRoot,
3204
3141
  target,
3205
3142
  targets,
@@ -3218,11 +3155,11 @@ async function runEvaluation(options) {
3218
3155
  onProgress
3219
3156
  } = options;
3220
3157
  const load = loadEvalCases;
3221
- const evalCases = await load(testFilePath, repoRoot, { verbose });
3158
+ const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
3222
3159
  const filteredEvalCases = filterEvalCases(evalCases, evalId);
3223
3160
  if (filteredEvalCases.length === 0) {
3224
3161
  if (evalId) {
3225
- throw new Error(`Test case with id '${evalId}' not found in ${testFilePath}`);
3162
+ throw new Error(`Eval case with id '${evalId}' not found in ${evalFilePath}`);
3226
3163
  }
3227
3164
  return [];
3228
3165
  }
@@ -3376,7 +3313,8 @@ async function runEvaluation(options) {
3376
3313
  target.name,
3377
3314
  (now ?? (() => /* @__PURE__ */ new Date()))(),
3378
3315
  outcome.reason,
3379
- promptInputs
3316
+ promptInputs,
3317
+ primaryProvider
3380
3318
  );
3381
3319
  results.push(errorResult);
3382
3320
  if (onResult) {
@@ -3460,7 +3398,7 @@ async function runBatchEvaluation(options) {
3460
3398
  agentTimeoutMs
3461
3399
  });
3462
3400
  } catch (error) {
3463
- const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
3401
+ const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
3464
3402
  results.push(errorResult);
3465
3403
  if (onResult) {
3466
3404
  await onResult(errorResult);
@@ -3537,7 +3475,7 @@ async function runEvalCase(options) {
3537
3475
  attempt += 1;
3538
3476
  continue;
3539
3477
  }
3540
- return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
3478
+ return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
3541
3479
  }
3542
3480
  }
3543
3481
  if (!providerResponse) {
@@ -3546,7 +3484,8 @@ async function runEvalCase(options) {
3546
3484
  target.name,
3547
3485
  nowFn(),
3548
3486
  lastError ?? new Error("Provider did not return a response"),
3549
- promptInputs
3487
+ promptInputs,
3488
+ provider
3550
3489
  );
3551
3490
  }
3552
3491
  if (cacheKey && cache && !cachedResponse) {
@@ -3566,7 +3505,7 @@ async function runEvalCase(options) {
3566
3505
  agentTimeoutMs
3567
3506
  });
3568
3507
  } catch (error) {
3569
- return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
3508
+ return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
3570
3509
  }
3571
3510
  }
3572
3511
  async function evaluateCandidate(options) {
@@ -3598,9 +3537,8 @@ async function evaluateCandidate(options) {
3598
3537
  const completedAt = nowFn();
3599
3538
  const rawRequest = {
3600
3539
  question: promptInputs.question,
3601
- guidelines: promptInputs.guidelines,
3602
- guideline_paths: evalCase.guideline_paths,
3603
- system_message: promptInputs.systemMessage ?? ""
3540
+ ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
3541
+ guideline_paths: evalCase.guideline_paths
3604
3542
  };
3605
3543
  return {
3606
3544
  eval_id: evalCase.id,
@@ -3858,13 +3796,12 @@ async function invokeProvider(provider, options) {
3858
3796
  }
3859
3797
  }
3860
3798
  }
3861
- function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
3799
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
3862
3800
  const message = error instanceof Error ? error.message : String(error);
3863
3801
  const rawRequest = {
3864
3802
  question: promptInputs.question,
3865
- guidelines: promptInputs.guidelines,
3803
+ ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
3866
3804
  guideline_paths: evalCase.guideline_paths,
3867
- system_message: promptInputs.systemMessage ?? "",
3868
3805
  error: message
3869
3806
  };
3870
3807
  return {