@agentv/core 0.5.3 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -36,6 +36,7 @@ __export(index_exports, {
36
36
  buildDirectoryChain: () => buildDirectoryChain,
37
37
  buildPromptInputs: () => buildPromptInputs,
38
38
  buildSearchRoots: () => buildSearchRoots,
39
+ consumeCodexLogEntries: () => consumeCodexLogEntries,
39
40
  createAgentKernel: () => createAgentKernel,
40
41
  createProvider: () => createProvider,
41
42
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
@@ -52,11 +53,13 @@ __export(index_exports, {
52
53
  listTargetNames: () => listTargetNames,
53
54
  loadEvalCases: () => loadEvalCases,
54
55
  readTargetDefinitions: () => readTargetDefinitions,
56
+ readTextFile: () => readTextFile,
55
57
  resolveAndCreateProvider: () => resolveAndCreateProvider,
56
58
  resolveFileReference: () => resolveFileReference,
57
59
  resolveTargetDefinition: () => resolveTargetDefinition,
58
60
  runEvalCase: () => runEvalCase,
59
- runEvaluation: () => runEvaluation
61
+ runEvaluation: () => runEvaluation,
62
+ subscribeToCodexLogEntries: () => subscribeToCodexLogEntries
60
63
  });
61
64
  module.exports = __toCommonJS(index_exports);
62
65
 
@@ -130,6 +133,10 @@ async function fileExists(filePath) {
130
133
  return false;
131
134
  }
132
135
  }
136
/**
 * Read a file as UTF-8 text and convert every CRLF pair to a single LF.
 *
 * @param {string} filePath - Path handed straight to `readFile`.
 * @returns {Promise<string>} The file contents with "\r\n" replaced by "\n".
 */
async function readTextFile(filePath) {
  const rawText = await (0, import_promises.readFile)(filePath, "utf8");
  // Only CRLF pairs are rewritten; a lone "\r" is left untouched.
  const normalized = rawText.replace(/\r\n/g, "\n");
  return normalized;
}
133
140
  async function findGitRoot(startPath) {
134
141
  let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
135
142
  const root = import_node_path.default.parse(currentDir).root;
@@ -292,6 +299,87 @@ function extractCodeBlocks(segments) {
292
299
  }
293
300
  return codeBlocks;
294
301
  }
302
/**
 * Flatten a list of eval messages into an ordered list of segments.
 *
 * - String content becomes a `{ type: "text", value }` segment.
 * - Array content is walked item by item; non-object items are ignored.
 * - `file` segments are resolved through `resolveFileReference`, read as
 *   UTF-8 (CRLF normalised to LF) and emitted as
 *   `{ type: "file", path, text, resolvedPath }`.
 * - For input messages, a file matching `guidelinePatterns` is recorded in
 *   `guidelinePaths` instead of being emitted as a segment.
 * - Any other object segment is cloned with `cloneJsonObject` and emitted.
 *
 * Missing or unreadable files are reported through `logWarning` and skipped;
 * they never abort processing.
 *
 * @param {object} options
 * @param {Array<{content: string | unknown[]}>} options.messages - Messages to process.
 * @param {string[]} options.searchRoots - Passed to `resolveFileReference`.
 * @param {string} options.repoRootPath - Base for the guideline-relative path.
 * @param {unknown} [options.guidelinePatterns] - Passed to `isGuidelineFile`.
 * @param {string[]} [options.guidelinePaths] - Mutated: receives resolved guideline paths.
 * @param {string[]} [options.textParts] - Mutated: receives every plain-text value seen.
 * @param {string} options.messageType - "input" enables guideline detection; any
 *   other value is treated as expected output in log wording.
 * @param {boolean} [options.verbose] - Log each discovered file to the console.
 * @returns {Promise<object[]>} The collected segments, in message order.
 */
async function processMessages(options) {
  const {
    messages,
    searchRoots,
    repoRootPath,
    guidelinePatterns,
    guidelinePaths,
    textParts,
    messageType,
    verbose
  } = options;
  const isInput = messageType === "input";
  const segments = [];
  for (const message of messages) {
    const content = message.content;
    if (typeof content === "string") {
      segments.push({ type: "text", value: content });
      if (textParts) {
        textParts.push(content);
      }
      continue;
    }
    for (const rawSegment of content) {
      if (!isJsonObject(rawSegment)) {
        continue;
      }
      if (asString(rawSegment.type) !== "file") {
        const clonedSegment = cloneJsonObject(rawSegment);
        segments.push(clonedSegment);
        const inlineValue = clonedSegment.value;
        if (typeof inlineValue === "string" && textParts) {
          textParts.push(inlineValue);
        }
        continue;
      }
      const rawValue = asString(rawSegment.value);
      if (!rawValue) {
        continue;
      }
      const { displayPath, resolvedPath, attempted } = await resolveFileReference(
        rawValue,
        searchRoots
      );
      if (!resolvedPath) {
        const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
        const context = isInput ? "" : " in expected_messages";
        logWarning(`File not found${context}: ${displayPath}`, attempts);
        continue;
      }
      try {
        const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
        // Guideline detection only applies to input messages, and only when the
        // caller supplied both the patterns and a list to collect into.
        if (isInput && guidelinePatterns && guidelinePaths) {
          const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
          if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
            guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
            if (verbose) {
              console.log(` [Guideline] Found: ${displayPath}`);
              console.log(` Resolved to: ${resolvedPath}`);
            }
            continue;
          }
        }
        segments.push({
          type: "file",
          path: displayPath,
          text: fileContent,
          resolvedPath: import_node_path2.default.resolve(resolvedPath)
        });
        if (verbose) {
          const label = isInput ? "[File]" : "[Expected Output File]";
          console.log(` ${label} Found: ${displayPath}`);
          console.log(` Resolved to: ${resolvedPath}`);
        }
      } catch (error) {
        // A thrown value is not guaranteed to be an Error; fall back to its
        // string form so the warning never reads "undefined".
        const reason = error instanceof Error ? error.message : String(error);
        const context = isInput ? "" : " expected output";
        logWarning(`Could not read${context} file ${resolvedPath}: ${reason}`);
      }
    }
  }
  return segments;
}
295
383
  async function loadEvalCases(evalFilePath, repoRoot, options) {
296
384
  const verbose = options?.verbose ?? false;
297
385
  const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
@@ -308,6 +396,9 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
308
396
  throw new Error(`Invalid test file format: ${evalFilePath}`);
309
397
  }
310
398
  const suite = parsed;
399
+ const datasetNameFromSuite = asString(suite.dataset)?.trim();
400
+ const fallbackDataset = import_node_path2.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
401
+ const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
311
402
  const schema = suite.$schema;
312
403
  if (schema !== SCHEMA_EVAL_V2) {
313
404
  const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
@@ -374,77 +465,34 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
374
465
  }
375
466
  }
376
467
  }
377
- const userSegments = [];
378
468
  const guidelinePaths = [];
379
- const userTextParts = [];
380
- for (const userMessage of userMessages) {
381
- const content = userMessage.content;
382
- if (typeof content === "string") {
383
- userSegments.push({ type: "text", value: content });
384
- userTextParts.push(content);
385
- continue;
386
- }
387
- for (const rawSegment of content) {
388
- if (!isJsonObject(rawSegment)) {
389
- continue;
390
- }
391
- const segmentType = asString(rawSegment.type);
392
- if (segmentType === "file") {
393
- const rawValue = asString(rawSegment.value);
394
- if (!rawValue) {
395
- continue;
396
- }
397
- const { displayPath, resolvedPath, attempted } = await resolveFileReference(
398
- rawValue,
399
- searchRoots
400
- );
401
- if (!resolvedPath) {
402
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
403
- logWarning(`File not found: ${displayPath}`, attempts);
404
- continue;
405
- }
406
- try {
407
- const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
408
- const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
409
- if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
410
- guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
411
- if (verbose) {
412
- console.log(` [Guideline] Found: ${displayPath}`);
413
- console.log(` Resolved to: ${resolvedPath}`);
414
- }
415
- } else {
416
- userSegments.push({
417
- type: "file",
418
- path: displayPath,
419
- text: fileContent,
420
- resolvedPath: import_node_path2.default.resolve(resolvedPath)
421
- });
422
- if (verbose) {
423
- console.log(` [File] Found: ${displayPath}`);
424
- console.log(` Resolved to: ${resolvedPath}`);
425
- }
426
- }
427
- } catch (error) {
428
- logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
429
- }
430
- continue;
431
- }
432
- const clonedSegment = cloneJsonObject(rawSegment);
433
- userSegments.push(clonedSegment);
434
- const inlineValue = clonedSegment.value;
435
- if (typeof inlineValue === "string") {
436
- userTextParts.push(inlineValue);
437
- }
438
- }
439
- }
440
- const codeSnippets = extractCodeBlocks(userSegments);
469
+ const inputTextParts = [];
470
+ const inputSegments = await processMessages({
471
+ messages: userMessages,
472
+ searchRoots,
473
+ repoRootPath,
474
+ guidelinePatterns,
475
+ guidelinePaths,
476
+ textParts: inputTextParts,
477
+ messageType: "input",
478
+ verbose
479
+ });
480
+ const outputSegments = await processMessages({
481
+ messages: assistantMessages,
482
+ searchRoots,
483
+ repoRootPath,
484
+ guidelinePatterns,
485
+ messageType: "output",
486
+ verbose
487
+ });
488
+ const codeSnippets = extractCodeBlocks(inputSegments);
441
489
  const assistantContent = assistantMessages[0]?.content;
442
- const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
443
- const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
490
+ const referenceAnswer = await resolveAssistantContent(assistantContent, searchRoots, verbose);
491
+ const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
444
492
  const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
445
493
  const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
446
494
  const userFilePaths = [];
447
- for (const segment of userSegments) {
495
+ for (const segment of inputSegments) {
448
496
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
449
497
  userFilePaths.push(segment.resolvedPath);
450
498
  }
@@ -455,16 +503,18 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
455
503
  ];
456
504
  const testCase = {
457
505
  id,
506
+ dataset: datasetName,
458
507
  conversation_id: conversationId,
459
- task: userTextPrompt,
460
- user_segments: userSegments,
508
+ question,
509
+ input_segments: inputSegments,
510
+ output_segments: outputSegments,
461
511
  system_message: systemMessageContent,
462
- expected_assistant_raw: expectedAssistantRaw,
512
+ reference_answer: referenceAnswer,
463
513
  guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
464
514
  guideline_patterns: guidelinePatterns,
465
515
  file_paths: allFilePaths,
466
516
  code_snippets: codeSnippets,
467
- outcome,
517
+ expected_outcome: outcome,
468
518
  evaluator: testCaseEvaluatorKind,
469
519
  evaluators
470
520
  };
@@ -500,36 +550,36 @@ ${content}`);
500
550
  logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
501
551
  }
502
552
  }
503
- const requestParts = [];
504
- for (const segment of testCase.user_segments) {
553
+ const questionParts = [];
554
+ for (const segment of testCase.input_segments) {
505
555
  const typeValue = segment.type;
506
556
  if (typeof typeValue === "string" && typeValue === "file") {
507
557
  const pathValue = segment.path;
508
558
  const textValue = segment.text;
509
559
  const label = typeof pathValue === "string" ? pathValue : "file";
510
560
  const body = typeof textValue === "string" ? textValue : "";
511
- requestParts.push(`=== ${label} ===
561
+ questionParts.push(`=== ${label} ===
512
562
  ${body}`);
513
563
  continue;
514
564
  }
515
565
  if (typeof typeValue === "string" && typeValue === "text") {
516
566
  const value = segment.value;
517
567
  if (typeof value === "string") {
518
- requestParts.push(value);
568
+ questionParts.push(value);
519
569
  }
520
570
  continue;
521
571
  }
522
572
  const genericValue = segment.value;
523
573
  if (typeof genericValue === "string") {
524
- requestParts.push(genericValue);
574
+ questionParts.push(genericValue);
525
575
  }
526
576
  }
527
577
  if (testCase.code_snippets.length > 0) {
528
- requestParts.push(testCase.code_snippets.join("\n"));
578
+ questionParts.push(testCase.code_snippets.join("\n"));
529
579
  }
530
- const request = requestParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
580
+ const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
531
581
  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
532
- return { request, guidelines, systemMessage: testCase.system_message };
582
+ return { question, guidelines, systemMessage: testCase.system_message };
533
583
  }
534
584
  async function fileExists2(absolutePath) {
535
585
  try {
@@ -741,7 +791,7 @@ function buildChatPrompt(request) {
741
791
  ${request.guidelines.trim()}`);
742
792
  }
743
793
  const systemContent = systemSegments.join("\n\n");
744
- const userContent = request.prompt.trim();
794
+ const userContent = request.question.trim();
745
795
  const prompt = [
746
796
  {
747
797
  role: "system",
@@ -835,6 +885,9 @@ var AzureProvider = class {
835
885
  );
836
886
  return mapResponse(ensureChatResponse(response));
837
887
  }
888
+ getAxAI() {
889
+ return this.ai;
890
+ }
838
891
  };
839
892
  var AnthropicProvider = class {
840
893
  constructor(targetName, config) {
@@ -869,6 +922,9 @@ var AnthropicProvider = class {
869
922
  );
870
923
  return mapResponse(ensureChatResponse(response));
871
924
  }
925
+ getAxAI() {
926
+ return this.ai;
927
+ }
872
928
  };
873
929
  var GeminiProvider = class {
874
930
  constructor(targetName, config) {
@@ -902,6 +958,9 @@ var GeminiProvider = class {
902
958
  );
903
959
  return mapResponse(ensureChatResponse(response));
904
960
  }
961
+ getAxAI() {
962
+ return this.ai;
963
+ }
905
964
  };
906
965
 
907
966
  // src/evaluation/providers/cli.ts
@@ -1030,7 +1089,7 @@ var CliProvider = class {
1030
1089
  healthcheck.commandTemplate,
1031
1090
  buildTemplateValues(
1032
1091
  {
1033
- prompt: "",
1092
+ question: "",
1034
1093
  guidelines: "",
1035
1094
  inputFiles: [],
1036
1095
  evalCaseId: "",
@@ -1057,7 +1116,7 @@ var CliProvider = class {
1057
1116
  function buildTemplateValues(request, config) {
1058
1117
  const inputFiles = normalizeInputFiles(request.inputFiles);
1059
1118
  return {
1060
- PROMPT: shellEscape(request.prompt ?? ""),
1119
+ PROMPT: shellEscape(request.question ?? ""),
1061
1120
  GUIDELINES: shellEscape(request.guidelines ?? ""),
1062
1121
  EVAL_ID: shellEscape(request.evalCaseId ?? ""),
1063
1122
  ATTEMPT: shellEscape(String(request.attempt ?? 0)),
@@ -1121,6 +1180,59 @@ var import_node_os = require("os");
1121
1180
  var import_node_path5 = __toESM(require("path"), 1);
1122
1181
  var import_node_util2 = require("util");
1123
1182
 
1183
+ // src/evaluation/providers/codex-log-tracker.ts
1184
+ var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
1185
+ var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
1186
/**
 * Return the array stored on `globalThis` under `GLOBAL_LOGS_KEY`,
 * creating and registering an empty one when no truthy value is present.
 *
 * @returns {Array} The shared log-entry array (same instance on every call).
 */
function getCodexLogStore() {
  const registry = globalThis;
  if (!registry[GLOBAL_LOGS_KEY]) {
    registry[GLOBAL_LOGS_KEY] = [];
  }
  return registry[GLOBAL_LOGS_KEY];
}
1196
/**
 * Return the `Set` stored on `globalThis` under `GLOBAL_SUBSCRIBERS_KEY`,
 * creating and registering an empty one when no truthy value is present.
 *
 * @returns {Set} The shared subscriber set (same instance on every call).
 */
function getSubscriberStore() {
  const registry = globalThis;
  if (!registry[GLOBAL_SUBSCRIBERS_KEY]) {
    registry[GLOBAL_SUBSCRIBERS_KEY] = /* @__PURE__ */ new Set();
  }
  return registry[GLOBAL_SUBSCRIBERS_KEY];
}
1206
/**
 * Deliver a log entry to every registered subscriber.
 *
 * The subscriber set is copied before iteration, so listeners added or
 * removed while notifying do not affect the current round. A listener that
 * throws is reported with `console.warn` and does not stop the others.
 *
 * @param {object} entry - The value passed to each listener.
 */
function notifySubscribers(entry) {
  const snapshot = [...getSubscriberStore()];
  snapshot.forEach((listener) => {
    try {
      listener(entry);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Codex log subscriber failed: ${message}`);
    }
  });
}
1217
/**
 * Append an entry to the shared log store, then notify subscribers.
 *
 * @param {object} entry - The log entry to store and broadcast.
 */
function recordCodexLogEntry(entry) {
  const logStore = getCodexLogStore();
  logStore.push(entry);
  notifySubscribers(entry);
}
1221
/**
 * Remove and return every entry currently held in the shared log store.
 *
 * @returns {Array} The drained entries in insertion order, or a fresh empty
 *   array when the store holds nothing. The store itself is left empty.
 */
function consumeCodexLogEntries() {
  const pending = getCodexLogStore();
  return pending.length > 0 ? pending.splice(0, pending.length) : [];
}
1228
/**
 * Register a listener in the shared subscriber set.
 *
 * @param {Function} listener - Callback to add to the subscriber set.
 * @returns {Function} A function that removes the listener again.
 */
function subscribeToCodexLogEntries(listener) {
  const subscribers = getSubscriberStore();
  subscribers.add(listener);
  const unsubscribe = () => {
    subscribers.delete(listener);
  };
  return unsubscribe;
}
1235
+
1124
1236
  // src/evaluation/providers/preread.ts
1125
1237
  var import_node_path4 = __toESM(require("path"), 1);
1126
1238
  function buildPromptDocument(request, inputFiles, options) {
@@ -1138,7 +1250,7 @@ function buildPromptDocument(request, inputFiles, options) {
1138
1250
  if (prereadBlock.length > 0) {
1139
1251
  parts.push("\n", prereadBlock);
1140
1252
  }
1141
- parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
1253
+ parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
1142
1254
  return parts.join("\n").trim();
1143
1255
  }
1144
1256
  function normalizeInputFiles2(inputFiles) {
@@ -1418,7 +1530,12 @@ var CodexProvider = class {
1418
1530
  attempt: request.attempt,
1419
1531
  format: this.config.logFormat ?? "summary"
1420
1532
  });
1421
- console.log(`Streaming Codex CLI output to ${filePath}`);
1533
+ recordCodexLogEntry({
1534
+ filePath,
1535
+ targetName: this.targetName,
1536
+ evalCaseId: request.evalCaseId,
1537
+ attempt: request.attempt
1538
+ });
1422
1539
  return logger;
1423
1540
  } catch (error) {
1424
1541
  const message = error instanceof Error ? error.message : String(error);
@@ -1950,7 +2067,7 @@ var MockProvider = class {
1950
2067
  return {
1951
2068
  text: this.cannedResponse,
1952
2069
  raw: {
1953
- prompt: request.prompt,
2070
+ question: request.question,
1954
2071
  guidelines: request.guidelines
1955
2072
  }
1956
2073
  };
@@ -2566,7 +2683,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
2566
2683
  if (prereadBlock.length > 0) {
2567
2684
  parts.push("\n", prereadBlock);
2568
2685
  }
2569
- parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
2686
+ parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
2570
2687
  return parts.join("\n").trim();
2571
2688
  }
2572
2689
  function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
@@ -2826,14 +2943,29 @@ var LlmJudgeEvaluator = class {
2826
2943
  if (!judgeProvider) {
2827
2944
  throw new Error("No judge provider available for LLM grading");
2828
2945
  }
2829
- const prompt = buildQualityPrompt(context.evalCase, context.candidate);
2830
- const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
2946
+ return this.evaluateWithPrompt(context, judgeProvider);
2947
+ }
2948
+ async evaluateWithPrompt(context, judgeProvider) {
2949
+ let prompt = buildQualityPrompt(context.evalCase, context.candidate);
2950
+ let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
2951
+ if (systemPrompt && hasTemplateVariables(systemPrompt)) {
2952
+ const variables = {
2953
+ input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
2954
+ output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
2955
+ candidate_answer: context.candidate,
2956
+ reference_answer: context.evalCase.reference_answer,
2957
+ expected_outcome: context.evalCase.expected_outcome,
2958
+ question: context.evalCase.question
2959
+ };
2960
+ prompt = substituteVariables(systemPrompt, variables);
2961
+ systemPrompt = QUALITY_SYSTEM_PROMPT;
2962
+ }
2831
2963
  const metadata = {
2832
2964
  ...systemPrompt !== void 0 ? { systemPrompt } : {},
2833
2965
  ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
2834
2966
  };
2835
2967
  const response = await judgeProvider.invoke({
2836
- prompt,
2968
+ question: prompt,
2837
2969
  metadata,
2838
2970
  evalCaseId: context.evalCase.id,
2839
2971
  attempt: context.attempt,
@@ -2845,6 +2977,7 @@ var LlmJudgeEvaluator = class {
2845
2977
  const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
2846
2978
  const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
2847
2979
  const reasoning = parsed.reasoning ?? response.reasoning;
2980
+ const expectedAspectCount = Math.max(hits.length + misses.length, 1);
2848
2981
  const evaluatorRawRequest = {
2849
2982
  id: (0, import_node_crypto2.randomUUID)(),
2850
2983
  provider: judgeProvider.id,
@@ -2857,16 +2990,16 @@ var LlmJudgeEvaluator = class {
2857
2990
  score,
2858
2991
  hits,
2859
2992
  misses,
2860
- expectedAspectCount: hits.length + misses.length || 1,
2993
+ expectedAspectCount,
2861
2994
  reasoning,
2862
2995
  evaluatorRawRequest
2863
2996
  };
2864
2997
  }
2865
2998
  };
2866
2999
  var QUALITY_SYSTEM_PROMPT = [
2867
- "You are an expert evaluator. Your goal is to grade the generated_answer based on how well it achieves the expected_outcome for the original task.",
3000
+ "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
2868
3001
  "",
2869
- "Use the reference_answer as a gold standard for a high-quality response. The generated_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
3002
+ "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
2870
3003
  "",
2871
3004
  "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
2872
3005
  "",
@@ -2879,18 +3012,18 @@ var QUALITY_SYSTEM_PROMPT = [
2879
3012
  ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
2880
3013
  "}"
2881
3014
  ].join("\n");
2882
- function buildQualityPrompt(testCase, candidate) {
3015
+ function buildQualityPrompt(evalCase, candidate) {
2883
3016
  const parts = [
2884
3017
  "[[ ## expected_outcome ## ]]",
2885
- testCase.outcome.trim(),
3018
+ evalCase.expected_outcome.trim(),
2886
3019
  "",
2887
- "[[ ## request ## ]]",
2888
- testCase.task.trim(),
3020
+ "[[ ## question ## ]]",
3021
+ evalCase.question.trim(),
2889
3022
  "",
2890
3023
  "[[ ## reference_answer ## ]]",
2891
- testCase.expected_assistant_raw.trim(),
3024
+ evalCase.reference_answer.trim(),
2892
3025
  "",
2893
- "[[ ## generated_answer ## ]]",
3026
+ "[[ ## candidate_answer ## ]]",
2894
3027
  candidate.trim(),
2895
3028
  "",
2896
3029
  "Respond with a single JSON object matching the schema described in the system prompt."
@@ -2990,14 +3123,14 @@ var CodeEvaluator = class {
2990
3123
  async evaluate(context) {
2991
3124
  const inputPayload = JSON.stringify(
2992
3125
  {
2993
- task: context.evalCase.task,
2994
- outcome: context.evalCase.outcome,
2995
- expected: context.evalCase.expected_assistant_raw,
2996
- output: context.candidate,
3126
+ question: context.evalCase.question,
3127
+ expected_outcome: context.evalCase.expected_outcome,
3128
+ reference_answer: context.evalCase.reference_answer,
3129
+ candidate_answer: context.candidate,
2997
3130
  system_message: context.promptInputs.systemMessage ?? "",
2998
3131
  guideline_paths: context.evalCase.guideline_paths,
2999
- attachments: context.evalCase.file_paths,
3000
- user_segments: context.evalCase.user_segments
3132
+ input_files: context.evalCase.file_paths,
3133
+ input_segments: context.evalCase.input_segments
3001
3134
  },
3002
3135
  null,
3003
3136
  2
@@ -3083,6 +3216,14 @@ function parseJsonSafe(payload) {
3083
3216
  return void 0;
3084
3217
  }
3085
3218
  }
3219
/**
 * Report whether the text contains at least one `${name}` placeholder,
 * where `name` consists of letters, digits and underscores.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} True when a placeholder is present.
 */
function hasTemplateVariables(text) {
  const placeholderPattern = /\$\{[a-zA-Z0-9_]+\}/;
  return placeholderPattern.test(text);
}
3222
/**
 * Replace `${name}` placeholders in a template with values from `variables`.
 *
 * Only own properties of `variables` are substituted. Names inherited from
 * `Object.prototype` (for example `${constructor}` or `${toString}`) are
 * left as written, as are unknown names and names whose value is `null`
 * or `undefined`.
 *
 * @param {string} template - Text containing `${name}` placeholders.
 * @param {Object<string, unknown>} variables - Placeholder values by name.
 * @returns {string} The template with known placeholders substituted.
 */
function substituteVariables(template, variables) {
  return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
    // Guard against prototype-chain lookups: a plain `variables[varName]`
    // would resolve inherited members for names such as "constructor".
    if (!Object.prototype.hasOwnProperty.call(variables, varName)) {
      return match;
    }
    return variables[varName] ?? match;
  });
}
3086
3227
 
3087
3228
  // src/evaluation/orchestrator.ts
3088
3229
  var import_node_crypto3 = require("crypto");
@@ -3439,7 +3580,7 @@ async function runBatchEvaluation(options) {
3439
3580
  const batchRequests = evalCases.map((evalCase, index) => {
3440
3581
  const promptInputs = promptInputsList[index];
3441
3582
  return {
3442
- prompt: promptInputs.request,
3583
+ question: promptInputs.question,
3443
3584
  guidelines: promptInputs.guidelines,
3444
3585
  guideline_patterns: evalCase.guideline_patterns,
3445
3586
  inputFiles: evalCase.file_paths,
@@ -3626,18 +3767,19 @@ async function evaluateCandidate(options) {
3626
3767
  });
3627
3768
  const completedAt = nowFn();
3628
3769
  const rawRequest = {
3629
- request: promptInputs.request,
3770
+ question: promptInputs.question,
3630
3771
  guidelines: promptInputs.guidelines,
3631
3772
  guideline_paths: evalCase.guideline_paths,
3632
3773
  system_message: promptInputs.systemMessage ?? ""
3633
3774
  };
3634
3775
  return {
3635
3776
  eval_id: evalCase.id,
3777
+ dataset: evalCase.dataset,
3636
3778
  conversation_id: evalCase.conversation_id,
3637
3779
  score: score.score,
3638
3780
  hits: score.hits,
3639
3781
  misses: score.misses,
3640
- model_answer: candidate,
3782
+ candidate_answer: candidate,
3641
3783
  expected_aspect_count: score.expectedAspectCount,
3642
3784
  target: target.name,
3643
3785
  timestamp: completedAt.toISOString(),
@@ -3809,7 +3951,7 @@ async function runLlmJudgeEvaluator(options) {
3809
3951
  async function resolveCustomPrompt(config) {
3810
3952
  if (config.promptPath) {
3811
3953
  try {
3812
- return await (0, import_promises6.readFile)(config.promptPath, "utf8");
3954
+ return await readTextFile(config.promptPath);
3813
3955
  } catch (error) {
3814
3956
  const message = error instanceof Error ? error.message : String(error);
3815
3957
  console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
@@ -3847,7 +3989,7 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
3847
3989
  await (0, import_promises6.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
3848
3990
  const payload = {
3849
3991
  eval_id: evalCase.id,
3850
- request: promptInputs.request,
3992
+ question: promptInputs.question,
3851
3993
  guidelines: promptInputs.guidelines,
3852
3994
  guideline_paths: evalCase.guideline_paths
3853
3995
  };
@@ -3869,7 +4011,7 @@ async function invokeProvider(provider, options) {
3869
4011
  }
3870
4012
  try {
3871
4013
  return await provider.invoke({
3872
- prompt: promptInputs.request,
4014
+ question: promptInputs.question,
3873
4015
  guidelines: promptInputs.guidelines,
3874
4016
  guideline_patterns: evalCase.guideline_patterns,
3875
4017
  inputFiles: evalCase.file_paths,
@@ -3889,7 +4031,7 @@ async function invokeProvider(provider, options) {
3889
4031
  function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
3890
4032
  const message = error instanceof Error ? error.message : String(error);
3891
4033
  const rawRequest = {
3892
- request: promptInputs.request,
4034
+ question: promptInputs.question,
3893
4035
  guidelines: promptInputs.guidelines,
3894
4036
  guideline_paths: evalCase.guideline_paths,
3895
4037
  system_message: promptInputs.systemMessage ?? "",
@@ -3897,11 +4039,12 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
3897
4039
  };
3898
4040
  return {
3899
4041
  eval_id: evalCase.id,
4042
+ dataset: evalCase.dataset,
3900
4043
  conversation_id: evalCase.conversation_id,
3901
4044
  score: 0,
3902
4045
  hits: [],
3903
4046
  misses: [`Error: ${message}`],
3904
- model_answer: `Error occurred: ${message}`,
4047
+ candidate_answer: `Error occurred: ${message}`,
3905
4048
  expected_aspect_count: 0,
3906
4049
  target: targetName,
3907
4050
  timestamp: timestamp.toISOString(),
@@ -3914,7 +4057,7 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
3914
4057
  hash.update(provider.id);
3915
4058
  hash.update(target.name);
3916
4059
  hash.update(evalCase.id);
3917
- hash.update(promptInputs.request);
4060
+ hash.update(promptInputs.question);
3918
4061
  hash.update(promptInputs.guidelines);
3919
4062
  hash.update(promptInputs.systemMessage ?? "");
3920
4063
  return hash.digest("hex");
@@ -3947,6 +4090,7 @@ function createAgentKernel() {
3947
4090
  buildDirectoryChain,
3948
4091
  buildPromptInputs,
3949
4092
  buildSearchRoots,
4093
+ consumeCodexLogEntries,
3950
4094
  createAgentKernel,
3951
4095
  createProvider,
3952
4096
  ensureVSCodeSubagents,
@@ -3963,10 +4107,12 @@ function createAgentKernel() {
3963
4107
  listTargetNames,
3964
4108
  loadEvalCases,
3965
4109
  readTargetDefinitions,
4110
+ readTextFile,
3966
4111
  resolveAndCreateProvider,
3967
4112
  resolveFileReference,
3968
4113
  resolveTargetDefinition,
3969
4114
  runEvalCase,
3970
- runEvaluation
4115
+ runEvaluation,
4116
+ subscribeToCodexLogEntries
3971
4117
  });
3972
4118
  //# sourceMappingURL=index.cjs.map