@agentv/core 0.5.3 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -4,8 +4,9 @@ import {
4
4
  buildSearchRoots,
5
5
  fileExists,
6
6
  findGitRoot,
7
+ readTextFile,
7
8
  resolveFileReference
8
- } from "./chunk-NL7K4CAK.js";
9
+ } from "./chunk-L7I5UTJU.js";
9
10
 
10
11
  // src/evaluation/types.ts
11
12
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -133,6 +134,87 @@ function extractCodeBlocks(segments) {
133
134
  }
134
135
  return codeBlocks;
135
136
  }
137
/**
 * Converts chat messages into a flat list of content segments, inlining the
 * text of every `file` segment that can be resolved and read.
 *
 * - String content yields one `{ type: "text", value }` segment.
 * - Array content is walked item by item; non-object items and `file` items
 *   without a value are ignored.
 * - A `file` item is resolved with `resolveFileReference`; files that cannot be
 *   found or read are reported through `logWarning` and skipped.
 * - When `messageType` is `"input"` and both `guidelinePatterns` and
 *   `guidelinePaths` are supplied, files matching `isGuidelineFile` are appended
 *   to `guidelinePaths` instead of being returned as segments.
 * - Every other item is cloned into the result.
 * - Messages whose content is neither a string nor an array are skipped.
 *
 * @param {object} options
 * @param {Array<object>} options.messages Messages whose `content` is a string or an array of segment objects.
 * @param {*} options.searchRoots Forwarded unchanged to `resolveFileReference`.
 * @param {string} options.repoRootPath Base path used to relativize resolved files before guideline matching.
 * @param {*} [options.guidelinePatterns] Forwarded unchanged to `isGuidelineFile`.
 * @param {string[]} [options.guidelinePaths] Mutated: receives the absolute path of each guideline file found.
 * @param {string[]} [options.textParts] Mutated: receives every inline string value encountered.
 * @param {string} options.messageType `"input"` enables guideline detection; any other value selects the expected-output wording in logs.
 * @param {boolean} [options.verbose] When truthy, logs each resolved file to the console.
 * @returns {Promise<Array<object>>} Segments in the order they were encountered.
 */
async function processMessages(options) {
  const {
    messages,
    searchRoots,
    repoRootPath,
    guidelinePatterns,
    guidelinePaths,
    textParts,
    messageType,
    verbose
  } = options;
  const isInput = messageType === "input";
  const segments = [];
  for (const message of messages) {
    const content = message.content;
    if (typeof content === "string") {
      segments.push({ type: "text", value: content });
      if (textParts) {
        textParts.push(content);
      }
      continue;
    }
    if (!Array.isArray(content)) {
      // Missing or malformed content has nothing to contribute; iterating it
      // would throw a TypeError and abort loading of the whole eval file.
      continue;
    }
    for (const rawSegment of content) {
      if (!isJsonObject(rawSegment)) {
        continue;
      }
      if (asString(rawSegment.type) !== "file") {
        const clonedSegment = cloneJsonObject(rawSegment);
        segments.push(clonedSegment);
        const inlineValue = clonedSegment.value;
        if (typeof inlineValue === "string" && textParts) {
          textParts.push(inlineValue);
        }
        continue;
      }
      const rawValue = asString(rawSegment.value);
      if (!rawValue) {
        continue;
      }
      const { displayPath, resolvedPath, attempted } = await resolveFileReference(
        rawValue,
        searchRoots
      );
      if (!resolvedPath) {
        const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
        const context = isInput ? "" : " in expected_messages";
        logWarning(`File not found${context}: ${displayPath}`, attempts);
        continue;
      }
      try {
        // The file is read before the guideline check, so an unreadable
        // guideline file is reported below and never added to guidelinePaths.
        const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
        const absolutePath = path.resolve(resolvedPath);
        if (isInput && guidelinePatterns && guidelinePaths) {
          const relativeToRepo = path.relative(repoRootPath, resolvedPath);
          if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
            guidelinePaths.push(absolutePath);
            if (verbose) {
              console.log(` [Guideline] Found: ${displayPath}`);
              console.log(` Resolved to: ${resolvedPath}`);
            }
            continue;
          }
        }
        segments.push({
          type: "file",
          path: displayPath,
          text: fileContent,
          resolvedPath: absolutePath
        });
        if (verbose) {
          const label = isInput ? "[File]" : "[Expected Output File]";
          console.log(` ${label} Found: ${displayPath}`);
          console.log(` Resolved to: ${resolvedPath}`);
        }
      } catch (error) {
        // A thrown value is not guaranteed to be an Error instance.
        const reason = error instanceof Error ? error.message : String(error);
        const context = isInput ? "" : " expected output";
        logWarning(`Could not read${context} file ${resolvedPath}: ${reason}`);
      }
    }
  }
  return segments;
}
136
218
  async function loadEvalCases(evalFilePath, repoRoot, options) {
137
219
  const verbose = options?.verbose ?? false;
138
220
  const absoluteTestPath = path.resolve(evalFilePath);
@@ -149,6 +231,9 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
149
231
  throw new Error(`Invalid test file format: ${evalFilePath}`);
150
232
  }
151
233
  const suite = parsed;
234
+ const datasetNameFromSuite = asString(suite.dataset)?.trim();
235
+ const fallbackDataset = path.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
236
+ const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
152
237
  const schema = suite.$schema;
153
238
  if (schema !== SCHEMA_EVAL_V2) {
154
239
  const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
@@ -215,77 +300,34 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
215
300
  }
216
301
  }
217
302
  }
218
- const userSegments = [];
219
303
  const guidelinePaths = [];
220
- const userTextParts = [];
221
- for (const userMessage of userMessages) {
222
- const content = userMessage.content;
223
- if (typeof content === "string") {
224
- userSegments.push({ type: "text", value: content });
225
- userTextParts.push(content);
226
- continue;
227
- }
228
- for (const rawSegment of content) {
229
- if (!isJsonObject(rawSegment)) {
230
- continue;
231
- }
232
- const segmentType = asString(rawSegment.type);
233
- if (segmentType === "file") {
234
- const rawValue = asString(rawSegment.value);
235
- if (!rawValue) {
236
- continue;
237
- }
238
- const { displayPath, resolvedPath, attempted } = await resolveFileReference(
239
- rawValue,
240
- searchRoots
241
- );
242
- if (!resolvedPath) {
243
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
244
- logWarning(`File not found: ${displayPath}`, attempts);
245
- continue;
246
- }
247
- try {
248
- const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
249
- const relativeToRepo = path.relative(repoRootPath, resolvedPath);
250
- if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
251
- guidelinePaths.push(path.resolve(resolvedPath));
252
- if (verbose) {
253
- console.log(` [Guideline] Found: ${displayPath}`);
254
- console.log(` Resolved to: ${resolvedPath}`);
255
- }
256
- } else {
257
- userSegments.push({
258
- type: "file",
259
- path: displayPath,
260
- text: fileContent,
261
- resolvedPath: path.resolve(resolvedPath)
262
- });
263
- if (verbose) {
264
- console.log(` [File] Found: ${displayPath}`);
265
- console.log(` Resolved to: ${resolvedPath}`);
266
- }
267
- }
268
- } catch (error) {
269
- logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
270
- }
271
- continue;
272
- }
273
- const clonedSegment = cloneJsonObject(rawSegment);
274
- userSegments.push(clonedSegment);
275
- const inlineValue = clonedSegment.value;
276
- if (typeof inlineValue === "string") {
277
- userTextParts.push(inlineValue);
278
- }
279
- }
280
- }
281
- const codeSnippets = extractCodeBlocks(userSegments);
304
+ const inputTextParts = [];
305
+ const inputSegments = await processMessages({
306
+ messages: userMessages,
307
+ searchRoots,
308
+ repoRootPath,
309
+ guidelinePatterns,
310
+ guidelinePaths,
311
+ textParts: inputTextParts,
312
+ messageType: "input",
313
+ verbose
314
+ });
315
+ const outputSegments = await processMessages({
316
+ messages: assistantMessages,
317
+ searchRoots,
318
+ repoRootPath,
319
+ guidelinePatterns,
320
+ messageType: "output",
321
+ verbose
322
+ });
323
+ const codeSnippets = extractCodeBlocks(inputSegments);
282
324
  const assistantContent = assistantMessages[0]?.content;
283
- const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
284
- const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
325
+ const referenceAnswer = await resolveAssistantContent(assistantContent, searchRoots, verbose);
326
+ const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
285
327
  const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
286
328
  const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
287
329
  const userFilePaths = [];
288
- for (const segment of userSegments) {
330
+ for (const segment of inputSegments) {
289
331
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
290
332
  userFilePaths.push(segment.resolvedPath);
291
333
  }
@@ -296,16 +338,18 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
296
338
  ];
297
339
  const testCase = {
298
340
  id,
341
+ dataset: datasetName,
299
342
  conversation_id: conversationId,
300
- task: userTextPrompt,
301
- user_segments: userSegments,
343
+ question,
344
+ input_segments: inputSegments,
345
+ output_segments: outputSegments,
302
346
  system_message: systemMessageContent,
303
- expected_assistant_raw: expectedAssistantRaw,
347
+ reference_answer: referenceAnswer,
304
348
  guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
305
349
  guideline_patterns: guidelinePatterns,
306
350
  file_paths: allFilePaths,
307
351
  code_snippets: codeSnippets,
308
- outcome,
352
+ expected_outcome: outcome,
309
353
  evaluator: testCaseEvaluatorKind,
310
354
  evaluators
311
355
  };
@@ -341,36 +385,36 @@ ${content}`);
341
385
  logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
342
386
  }
343
387
  }
344
- const requestParts = [];
345
- for (const segment of testCase.user_segments) {
388
+ const questionParts = [];
389
+ for (const segment of testCase.input_segments) {
346
390
  const typeValue = segment.type;
347
391
  if (typeof typeValue === "string" && typeValue === "file") {
348
392
  const pathValue = segment.path;
349
393
  const textValue = segment.text;
350
394
  const label = typeof pathValue === "string" ? pathValue : "file";
351
395
  const body = typeof textValue === "string" ? textValue : "";
352
- requestParts.push(`=== ${label} ===
396
+ questionParts.push(`=== ${label} ===
353
397
  ${body}`);
354
398
  continue;
355
399
  }
356
400
  if (typeof typeValue === "string" && typeValue === "text") {
357
401
  const value = segment.value;
358
402
  if (typeof value === "string") {
359
- requestParts.push(value);
403
+ questionParts.push(value);
360
404
  }
361
405
  continue;
362
406
  }
363
407
  const genericValue = segment.value;
364
408
  if (typeof genericValue === "string") {
365
- requestParts.push(genericValue);
409
+ questionParts.push(genericValue);
366
410
  }
367
411
  }
368
412
  if (testCase.code_snippets.length > 0) {
369
- requestParts.push(testCase.code_snippets.join("\n"));
413
+ questionParts.push(testCase.code_snippets.join("\n"));
370
414
  }
371
- const request = requestParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
415
+ const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
372
416
  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
373
- return { request, guidelines, systemMessage: testCase.system_message };
417
+ return { question, guidelines, systemMessage: testCase.system_message };
374
418
  }
375
419
  async function fileExists2(absolutePath) {
376
420
  try {
@@ -582,7 +626,7 @@ function buildChatPrompt(request) {
582
626
  ${request.guidelines.trim()}`);
583
627
  }
584
628
  const systemContent = systemSegments.join("\n\n");
585
- const userContent = request.prompt.trim();
629
+ const userContent = request.question.trim();
586
630
  const prompt = [
587
631
  {
588
632
  role: "system",
@@ -676,6 +720,9 @@ var AzureProvider = class {
676
720
  );
677
721
  return mapResponse(ensureChatResponse(response));
678
722
  }
723
// Accessor that returns this provider instance's `ai` property unchanged.
+ getAxAI() {
724
+ return this.ai;
725
+ }
679
726
  };
680
727
  var AnthropicProvider = class {
681
728
  constructor(targetName, config) {
@@ -710,6 +757,9 @@ var AnthropicProvider = class {
710
757
  );
711
758
  return mapResponse(ensureChatResponse(response));
712
759
  }
760
// Accessor that returns this provider instance's `ai` property unchanged.
+ getAxAI() {
761
+ return this.ai;
762
+ }
713
763
  };
714
764
  var GeminiProvider = class {
715
765
  constructor(targetName, config) {
@@ -743,6 +793,9 @@ var GeminiProvider = class {
743
793
  );
744
794
  return mapResponse(ensureChatResponse(response));
745
795
  }
796
// Accessor that returns this provider instance's `ai` property unchanged.
+ getAxAI() {
797
+ return this.ai;
798
+ }
746
799
  };
747
800
 
748
801
  // src/evaluation/providers/cli.ts
@@ -871,7 +924,7 @@ var CliProvider = class {
871
924
  healthcheck.commandTemplate,
872
925
  buildTemplateValues(
873
926
  {
874
- prompt: "",
927
+ question: "",
875
928
  guidelines: "",
876
929
  inputFiles: [],
877
930
  evalCaseId: "",
@@ -898,7 +951,7 @@ var CliProvider = class {
898
951
  function buildTemplateValues(request, config) {
899
952
  const inputFiles = normalizeInputFiles(request.inputFiles);
900
953
  return {
901
- PROMPT: shellEscape(request.prompt ?? ""),
954
+ PROMPT: shellEscape(request.question ?? ""),
902
955
  GUIDELINES: shellEscape(request.guidelines ?? ""),
903
956
  EVAL_ID: shellEscape(request.evalCaseId ?? ""),
904
957
  ATTEMPT: shellEscape(String(request.attempt ?? 0)),
@@ -962,6 +1015,59 @@ import { tmpdir } from "node:os";
962
1015
  import path4 from "node:path";
963
1016
  import { promisify as promisify2 } from "node:util";
964
1017
 
1018
+ // src/evaluation/providers/codex-log-tracker.ts
1019
// Keys under which the log-entry array and the subscriber set are stored on
// `globalThis`. `Symbol.for` looks the key up in the runtime-wide symbol
// registry, so the same description string always yields the same symbol.
+ var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
1020
+ var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
1021
/**
 * Returns the array of recorded Codex log entries kept on `globalThis`,
 * creating and registering an empty one the first time it is requested.
 *
 * @returns {Array} The shared, mutable array stored under `GLOBAL_LOGS_KEY`.
 */
function getCodexLogStore() {
  const registry = globalThis;
  let store = registry[GLOBAL_LOGS_KEY];
  if (!store) {
    store = [];
    registry[GLOBAL_LOGS_KEY] = store;
  }
  return store;
}
1031
/**
 * Returns the set of log-entry listeners kept on `globalThis`, creating and
 * registering an empty one the first time it is requested.
 *
 * @returns {Set} The shared, mutable set stored under `GLOBAL_SUBSCRIBERS_KEY`.
 */
function getSubscriberStore() {
  const registry = globalThis;
  let subscribers = registry[GLOBAL_SUBSCRIBERS_KEY];
  if (!subscribers) {
    subscribers = new Set();
    registry[GLOBAL_SUBSCRIBERS_KEY] = subscribers;
  }
  return subscribers;
}
1041
/**
 * Calls every registered listener with `entry`. A listener that throws is
 * reported with `console.warn` and does not stop the remaining listeners.
 *
 * @param {*} entry Value handed to each listener.
 */
function notifySubscribers(entry) {
  // Iterate over a copy so the set can change while listeners are running.
  const snapshot = [...getSubscriberStore()];
  snapshot.forEach((listener) => {
    try {
      listener(entry);
    } catch (failure) {
      const reason = failure instanceof Error ? failure.message : String(failure);
      console.warn(`Codex log subscriber failed: ${reason}`);
    }
  });
}
1052
/**
 * Appends `entry` to the shared log store, then notifies every subscriber.
 *
 * @param {*} entry Log entry to store and broadcast.
 */
function recordCodexLogEntry(entry) {
  const store = getCodexLogStore();
  store.push(entry);
  notifySubscribers(entry);
}
1056
/**
 * Removes and returns every entry currently held in the shared log store.
 *
 * @returns {Array} The drained entries in insertion order; a new empty array
 *   when the store holds nothing.
 */
function consumeCodexLogEntries() {
  const pending = getCodexLogStore();
  // splice empties the shared array in place and hands back what it removed.
  return pending.length > 0 ? pending.splice(0, pending.length) : [];
}
1063
/**
 * Registers `listener` in the shared subscriber set.
 *
 * @param {Function} listener Callback to add to the subscriber set.
 * @returns {Function} Call it to remove `listener` from the set again.
 */
function subscribeToCodexLogEntries(listener) {
  const subscribers = getSubscriberStore();
  subscribers.add(listener);
  const unsubscribe = () => {
    subscribers.delete(listener);
  };
  return unsubscribe;
}
1070
+
965
1071
  // src/evaluation/providers/preread.ts
966
1072
  import path3 from "node:path";
967
1073
  function buildPromptDocument(request, inputFiles, options) {
@@ -979,7 +1085,7 @@ function buildPromptDocument(request, inputFiles, options) {
979
1085
  if (prereadBlock.length > 0) {
980
1086
  parts.push("\n", prereadBlock);
981
1087
  }
982
- parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
1088
+ parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
983
1089
  return parts.join("\n").trim();
984
1090
  }
985
1091
  function normalizeInputFiles2(inputFiles) {
@@ -1259,7 +1365,12 @@ var CodexProvider = class {
1259
1365
  attempt: request.attempt,
1260
1366
  format: this.config.logFormat ?? "summary"
1261
1367
  });
1262
- console.log(`Streaming Codex CLI output to ${filePath}`);
1368
+ recordCodexLogEntry({
1369
+ filePath,
1370
+ targetName: this.targetName,
1371
+ evalCaseId: request.evalCaseId,
1372
+ attempt: request.attempt
1373
+ });
1263
1374
  return logger;
1264
1375
  } catch (error) {
1265
1376
  const message = error instanceof Error ? error.message : String(error);
@@ -1791,7 +1902,7 @@ var MockProvider = class {
1791
1902
  return {
1792
1903
  text: this.cannedResponse,
1793
1904
  raw: {
1794
- prompt: request.prompt,
1905
+ question: request.question,
1795
1906
  guidelines: request.guidelines
1796
1907
  }
1797
1908
  };
@@ -2407,7 +2518,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
2407
2518
  if (prereadBlock.length > 0) {
2408
2519
  parts.push("\n", prereadBlock);
2409
2520
  }
2410
- parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
2521
+ parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
2411
2522
  return parts.join("\n").trim();
2412
2523
  }
2413
2524
  function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
@@ -2662,14 +2773,29 @@ var LlmJudgeEvaluator = class {
2662
2773
  if (!judgeProvider) {
2663
2774
  throw new Error("No judge provider available for LLM grading");
2664
2775
  }
2665
- const prompt = buildQualityPrompt(context.evalCase, context.candidate);
2666
- const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
2776
+ return this.evaluateWithPrompt(context, judgeProvider);
2777
+ }
2778
+ async evaluateWithPrompt(context, judgeProvider) {
2779
+ let prompt = buildQualityPrompt(context.evalCase, context.candidate);
2780
+ let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
2781
+ if (systemPrompt && hasTemplateVariables(systemPrompt)) {
2782
+ const variables = {
2783
+ input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
2784
+ output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
2785
+ candidate_answer: context.candidate,
2786
+ reference_answer: context.evalCase.reference_answer,
2787
+ expected_outcome: context.evalCase.expected_outcome,
2788
+ question: context.evalCase.question
2789
+ };
2790
+ prompt = substituteVariables(systemPrompt, variables);
2791
+ systemPrompt = QUALITY_SYSTEM_PROMPT;
2792
+ }
2667
2793
  const metadata = {
2668
2794
  ...systemPrompt !== void 0 ? { systemPrompt } : {},
2669
2795
  ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
2670
2796
  };
2671
2797
  const response = await judgeProvider.invoke({
2672
- prompt,
2798
+ question: prompt,
2673
2799
  metadata,
2674
2800
  evalCaseId: context.evalCase.id,
2675
2801
  attempt: context.attempt,
@@ -2681,6 +2807,7 @@ var LlmJudgeEvaluator = class {
2681
2807
  const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
2682
2808
  const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
2683
2809
  const reasoning = parsed.reasoning ?? response.reasoning;
2810
+ const expectedAspectCount = Math.max(hits.length + misses.length, 1);
2684
2811
  const evaluatorRawRequest = {
2685
2812
  id: randomUUID2(),
2686
2813
  provider: judgeProvider.id,
@@ -2693,16 +2820,16 @@ var LlmJudgeEvaluator = class {
2693
2820
  score,
2694
2821
  hits,
2695
2822
  misses,
2696
- expectedAspectCount: hits.length + misses.length || 1,
2823
+ expectedAspectCount,
2697
2824
  reasoning,
2698
2825
  evaluatorRawRequest
2699
2826
  };
2700
2827
  }
2701
2828
  };
2702
2829
  var QUALITY_SYSTEM_PROMPT = [
2703
- "You are an expert evaluator. Your goal is to grade the generated_answer based on how well it achieves the expected_outcome for the original task.",
2830
+ "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
2704
2831
  "",
2705
- "Use the reference_answer as a gold standard for a high-quality response. The generated_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
2832
+ "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
2706
2833
  "",
2707
2834
  "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
2708
2835
  "",
@@ -2715,18 +2842,18 @@ var QUALITY_SYSTEM_PROMPT = [
2715
2842
  ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
2716
2843
  "}"
2717
2844
  ].join("\n");
2718
- function buildQualityPrompt(testCase, candidate) {
2845
+ function buildQualityPrompt(evalCase, candidate) {
2719
2846
  const parts = [
2720
2847
  "[[ ## expected_outcome ## ]]",
2721
- testCase.outcome.trim(),
2848
+ evalCase.expected_outcome.trim(),
2722
2849
  "",
2723
- "[[ ## request ## ]]",
2724
- testCase.task.trim(),
2850
+ "[[ ## question ## ]]",
2851
+ evalCase.question.trim(),
2725
2852
  "",
2726
2853
  "[[ ## reference_answer ## ]]",
2727
- testCase.expected_assistant_raw.trim(),
2854
+ evalCase.reference_answer.trim(),
2728
2855
  "",
2729
- "[[ ## generated_answer ## ]]",
2856
+ "[[ ## candidate_answer ## ]]",
2730
2857
  candidate.trim(),
2731
2858
  "",
2732
2859
  "Respond with a single JSON object matching the schema described in the system prompt."
@@ -2826,14 +2953,14 @@ var CodeEvaluator = class {
2826
2953
  async evaluate(context) {
2827
2954
  const inputPayload = JSON.stringify(
2828
2955
  {
2829
- task: context.evalCase.task,
2830
- outcome: context.evalCase.outcome,
2831
- expected: context.evalCase.expected_assistant_raw,
2832
- output: context.candidate,
2956
+ question: context.evalCase.question,
2957
+ expected_outcome: context.evalCase.expected_outcome,
2958
+ reference_answer: context.evalCase.reference_answer,
2959
+ candidate_answer: context.candidate,
2833
2960
  system_message: context.promptInputs.systemMessage ?? "",
2834
2961
  guideline_paths: context.evalCase.guideline_paths,
2835
- attachments: context.evalCase.file_paths,
2836
- user_segments: context.evalCase.user_segments
2962
+ input_files: context.evalCase.file_paths,
2963
+ input_segments: context.evalCase.input_segments
2837
2964
  },
2838
2965
  null,
2839
2966
  2
@@ -2919,10 +3046,18 @@ function parseJsonSafe(payload) {
2919
3046
  return void 0;
2920
3047
  }
2921
3048
  }
3049
/**
 * Reports whether `text` contains at least one `${name}` placeholder, where
 * `name` is one or more ASCII letters, digits or underscores.
 *
 * @param {string} text Text to inspect.
 * @returns {boolean} `true` when a placeholder is present.
 */
function hasTemplateVariables(text) {
  const placeholderPattern = /\$\{[a-zA-Z0-9_]+\}/;
  return placeholderPattern.test(text);
}
3052
/**
 * Replaces each `${name}` placeholder in `template` with `variables[name]`.
 *
 * Only own properties of `variables` are substituted. Placeholders that name
 * an unknown variable, a `null`/`undefined` value, or an inherited
 * `Object.prototype` member such as `constructor` or `toString` are left
 * untouched.
 *
 * @param {string} template Text containing `${name}` placeholders.
 * @param {object} variables Map from placeholder name to replacement value.
 * @returns {string} The template with every known placeholder substituted.
 */
function substituteVariables(template, variables) {
  return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (placeholder, varName) => {
    // A plain property read would also find inherited members, so
    // `${constructor}` would be replaced with the source text of `Object`.
    if (!Object.prototype.hasOwnProperty.call(variables, varName)) {
      return placeholder;
    }
    return variables[varName] ?? placeholder;
  });
}
2922
3057
 
2923
3058
  // src/evaluation/orchestrator.ts
2924
3059
  import { createHash, randomUUID as randomUUID3 } from "node:crypto";
2925
- import { mkdir as mkdir2, readFile as readFile4, writeFile as writeFile2 } from "node:fs/promises";
3060
+ import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
2926
3061
  import path7 from "node:path";
2927
3062
 
2928
3063
  // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
@@ -3275,7 +3410,7 @@ async function runBatchEvaluation(options) {
3275
3410
  const batchRequests = evalCases.map((evalCase, index) => {
3276
3411
  const promptInputs = promptInputsList[index];
3277
3412
  return {
3278
- prompt: promptInputs.request,
3413
+ question: promptInputs.question,
3279
3414
  guidelines: promptInputs.guidelines,
3280
3415
  guideline_patterns: evalCase.guideline_patterns,
3281
3416
  inputFiles: evalCase.file_paths,
@@ -3462,18 +3597,19 @@ async function evaluateCandidate(options) {
3462
3597
  });
3463
3598
  const completedAt = nowFn();
3464
3599
  const rawRequest = {
3465
- request: promptInputs.request,
3600
+ question: promptInputs.question,
3466
3601
  guidelines: promptInputs.guidelines,
3467
3602
  guideline_paths: evalCase.guideline_paths,
3468
3603
  system_message: promptInputs.systemMessage ?? ""
3469
3604
  };
3470
3605
  return {
3471
3606
  eval_id: evalCase.id,
3607
+ dataset: evalCase.dataset,
3472
3608
  conversation_id: evalCase.conversation_id,
3473
3609
  score: score.score,
3474
3610
  hits: score.hits,
3475
3611
  misses: score.misses,
3476
- model_answer: candidate,
3612
+ candidate_answer: candidate,
3477
3613
  expected_aspect_count: score.expectedAspectCount,
3478
3614
  target: target.name,
3479
3615
  timestamp: completedAt.toISOString(),
@@ -3645,7 +3781,7 @@ async function runLlmJudgeEvaluator(options) {
3645
3781
  async function resolveCustomPrompt(config) {
3646
3782
  if (config.promptPath) {
3647
3783
  try {
3648
- return await readFile4(config.promptPath, "utf8");
3784
+ return await readTextFile(config.promptPath);
3649
3785
  } catch (error) {
3650
3786
  const message = error instanceof Error ? error.message : String(error);
3651
3787
  console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
@@ -3683,7 +3819,7 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
3683
3819
  await mkdir2(path7.dirname(filePath), { recursive: true });
3684
3820
  const payload = {
3685
3821
  eval_id: evalCase.id,
3686
- request: promptInputs.request,
3822
+ question: promptInputs.question,
3687
3823
  guidelines: promptInputs.guidelines,
3688
3824
  guideline_paths: evalCase.guideline_paths
3689
3825
  };
@@ -3705,7 +3841,7 @@ async function invokeProvider(provider, options) {
3705
3841
  }
3706
3842
  try {
3707
3843
  return await provider.invoke({
3708
- prompt: promptInputs.request,
3844
+ question: promptInputs.question,
3709
3845
  guidelines: promptInputs.guidelines,
3710
3846
  guideline_patterns: evalCase.guideline_patterns,
3711
3847
  inputFiles: evalCase.file_paths,
@@ -3725,7 +3861,7 @@ async function invokeProvider(provider, options) {
3725
3861
  function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
3726
3862
  const message = error instanceof Error ? error.message : String(error);
3727
3863
  const rawRequest = {
3728
- request: promptInputs.request,
3864
+ question: promptInputs.question,
3729
3865
  guidelines: promptInputs.guidelines,
3730
3866
  guideline_paths: evalCase.guideline_paths,
3731
3867
  system_message: promptInputs.systemMessage ?? "",
@@ -3733,11 +3869,12 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
3733
3869
  };
3734
3870
  return {
3735
3871
  eval_id: evalCase.id,
3872
+ dataset: evalCase.dataset,
3736
3873
  conversation_id: evalCase.conversation_id,
3737
3874
  score: 0,
3738
3875
  hits: [],
3739
3876
  misses: [`Error: ${message}`],
3740
- model_answer: `Error occurred: ${message}`,
3877
+ candidate_answer: `Error occurred: ${message}`,
3741
3878
  expected_aspect_count: 0,
3742
3879
  target: targetName,
3743
3880
  timestamp: timestamp.toISOString(),
@@ -3750,7 +3887,7 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
3750
3887
  hash.update(provider.id);
3751
3888
  hash.update(target.name);
3752
3889
  hash.update(evalCase.id);
3753
- hash.update(promptInputs.request);
3890
+ hash.update(promptInputs.question);
3754
3891
  hash.update(promptInputs.guidelines);
3755
3892
  hash.update(promptInputs.systemMessage ?? "");
3756
3893
  return hash.digest("hex");
@@ -3782,6 +3919,7 @@ export {
3782
3919
  buildDirectoryChain,
3783
3920
  buildPromptInputs,
3784
3921
  buildSearchRoots,
3922
+ consumeCodexLogEntries,
3785
3923
  createAgentKernel,
3786
3924
  createProvider,
3787
3925
  ensureVSCodeSubagents,
@@ -3798,10 +3936,12 @@ export {
3798
3936
  listTargetNames,
3799
3937
  loadEvalCases,
3800
3938
  readTargetDefinitions,
3939
+ readTextFile,
3801
3940
  resolveAndCreateProvider,
3802
3941
  resolveFileReference,
3803
3942
  resolveTargetDefinition,
3804
3943
  runEvalCase,
3805
- runEvaluation
3944
+ runEvaluation,
3945
+ subscribeToCodexLogEntries
3806
3946
  };
3807
3947
  //# sourceMappingURL=index.js.map