@agentv/core 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -6,7 +6,7 @@ import {
6
6
  findGitRoot,
7
7
  readTextFile,
8
8
  resolveFileReference
9
- } from "./chunk-OW3SHBIJ.js";
9
+ } from "./chunk-L7I5UTJU.js";
10
10
 
11
11
  // src/evaluation/types.ts
12
12
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -134,6 +134,87 @@ function extractCodeBlocks(segments) {
134
134
  }
135
135
  return codeBlocks;
136
136
  }
137
/**
 * Converts a list of chat messages into a flat, ordered list of content segments.
 *
 * - String content becomes a `{ type: "text", value }` segment.
 * - Iterable content is walked entry by entry: non-object entries are ignored,
 *   `type: "file"` entries are resolved with `resolveFileReference` and read from
 *   disk (CRLF normalised to LF), and every other object entry is cloned into the
 *   result unchanged.
 * - Content that is neither a string nor iterable is reported with `logWarning`
 *   and the message is skipped, instead of aborting with a `TypeError`.
 *
 * When `messageType` is "input" and both `guidelinePatterns` and `guidelinePaths`
 * are supplied, files accepted by `isGuidelineFile` are appended to
 * `guidelinePaths` (as absolute paths) and are NOT emitted as segments.
 *
 * @param {object} options
 * @param {Array<{content: unknown}>} options.messages Messages to process.
 * @param {*} options.searchRoots Forwarded to `resolveFileReference`.
 * @param {string} options.repoRootPath Base used to relativise a resolved path before guideline matching.
 * @param {*} [options.guidelinePatterns] Forwarded to `isGuidelineFile`.
 * @param {string[]} [options.guidelinePaths] Output array for detected guideline files.
 * @param {string[]} [options.textParts] Output array that receives every inline string value.
 * @param {string} options.messageType "input", or any other value for expected-output wording in logs.
 * @param {boolean} [options.verbose] Emit `console.log` lines for each file found.
 * @returns {Promise<object[]>} The collected segments, in message order.
 */
async function processMessages(options) {
  const {
    messages,
    searchRoots,
    repoRootPath,
    guidelinePatterns,
    guidelinePaths,
    textParts,
    messageType,
    verbose
  } = options;
  const isInput = messageType === "input";
  const segments = [];
  for (const message of messages) {
    const content = message.content;
    if (typeof content === "string") {
      segments.push({ type: "text", value: content });
      if (textParts) {
        textParts.push(content);
      }
      continue;
    }
    // `for...of` below throws on null/undefined/plain objects; one malformed
    // message should not abort loading of every other message.
    if (content == null || typeof content[Symbol.iterator] !== "function") {
      const context = isInput ? "" : " in expected_messages";
      logWarning(`Skipping message with unsupported content${context}: expected a string or an array of segments`);
      continue;
    }
    for (const rawSegment of content) {
      if (!isJsonObject(rawSegment)) {
        continue;
      }
      if (asString(rawSegment.type) !== "file") {
        // Inline (non-file) segment: keep a copy and collect its text, if any.
        const clonedSegment = cloneJsonObject(rawSegment);
        segments.push(clonedSegment);
        const inlineValue = clonedSegment.value;
        if (typeof inlineValue === "string" && textParts) {
          textParts.push(inlineValue);
        }
        continue;
      }
      const rawValue = asString(rawSegment.value);
      if (!rawValue) {
        continue;
      }
      const { displayPath, resolvedPath, attempted } = await resolveFileReference(
        rawValue,
        searchRoots
      );
      if (!resolvedPath) {
        const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
        const context = isInput ? "" : " in expected_messages";
        logWarning(`File not found${context}: ${displayPath}`, attempts);
        continue;
      }
      try {
        const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
        if (isInput && guidelinePatterns && guidelinePaths) {
          const relativeToRepo = path.relative(repoRootPath, resolvedPath);
          if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
            guidelinePaths.push(path.resolve(resolvedPath));
            if (verbose) {
              console.log(` [Guideline] Found: ${displayPath}`);
              console.log(` Resolved to: ${resolvedPath}`);
            }
            continue;
          }
        }
        segments.push({
          type: "file",
          path: displayPath,
          text: fileContent,
          resolvedPath: path.resolve(resolvedPath)
        });
        if (verbose) {
          const label = isInput ? "[File]" : "[Expected Output File]";
          console.log(` ${label} Found: ${displayPath}`);
          console.log(` Resolved to: ${resolvedPath}`);
        }
      } catch (error) {
        // Anything can be thrown in JS; avoid logging "undefined" for non-Error values.
        const reason = error instanceof Error ? error.message : String(error);
        const context = isInput ? "" : " expected output";
        logWarning(`Could not read${context} file ${resolvedPath}: ${reason}`);
      }
    }
  }
  return segments;
}
137
218
  async function loadEvalCases(evalFilePath, repoRoot, options) {
138
219
  const verbose = options?.verbose ?? false;
139
220
  const absoluteTestPath = path.resolve(evalFilePath);
@@ -219,77 +300,34 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
219
300
  }
220
301
  }
221
302
  }
222
- const userSegments = [];
223
303
  const guidelinePaths = [];
224
- const userTextParts = [];
225
- for (const userMessage of userMessages) {
226
- const content = userMessage.content;
227
- if (typeof content === "string") {
228
- userSegments.push({ type: "text", value: content });
229
- userTextParts.push(content);
230
- continue;
231
- }
232
- for (const rawSegment of content) {
233
- if (!isJsonObject(rawSegment)) {
234
- continue;
235
- }
236
- const segmentType = asString(rawSegment.type);
237
- if (segmentType === "file") {
238
- const rawValue = asString(rawSegment.value);
239
- if (!rawValue) {
240
- continue;
241
- }
242
- const { displayPath, resolvedPath, attempted } = await resolveFileReference(
243
- rawValue,
244
- searchRoots
245
- );
246
- if (!resolvedPath) {
247
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
248
- logWarning(`File not found: ${displayPath}`, attempts);
249
- continue;
250
- }
251
- try {
252
- const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
253
- const relativeToRepo = path.relative(repoRootPath, resolvedPath);
254
- if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
255
- guidelinePaths.push(path.resolve(resolvedPath));
256
- if (verbose) {
257
- console.log(` [Guideline] Found: ${displayPath}`);
258
- console.log(` Resolved to: ${resolvedPath}`);
259
- }
260
- } else {
261
- userSegments.push({
262
- type: "file",
263
- path: displayPath,
264
- text: fileContent,
265
- resolvedPath: path.resolve(resolvedPath)
266
- });
267
- if (verbose) {
268
- console.log(` [File] Found: ${displayPath}`);
269
- console.log(` Resolved to: ${resolvedPath}`);
270
- }
271
- }
272
- } catch (error) {
273
- logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
274
- }
275
- continue;
276
- }
277
- const clonedSegment = cloneJsonObject(rawSegment);
278
- userSegments.push(clonedSegment);
279
- const inlineValue = clonedSegment.value;
280
- if (typeof inlineValue === "string") {
281
- userTextParts.push(inlineValue);
282
- }
283
- }
284
- }
285
- const codeSnippets = extractCodeBlocks(userSegments);
304
+ const inputTextParts = [];
305
+ const inputSegments = await processMessages({
306
+ messages: userMessages,
307
+ searchRoots,
308
+ repoRootPath,
309
+ guidelinePatterns,
310
+ guidelinePaths,
311
+ textParts: inputTextParts,
312
+ messageType: "input",
313
+ verbose
314
+ });
315
+ const outputSegments = await processMessages({
316
+ messages: assistantMessages,
317
+ searchRoots,
318
+ repoRootPath,
319
+ guidelinePatterns,
320
+ messageType: "output",
321
+ verbose
322
+ });
323
+ const codeSnippets = extractCodeBlocks(inputSegments);
286
324
  const assistantContent = assistantMessages[0]?.content;
287
- const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
288
- const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
325
+ const referenceAnswer = await resolveAssistantContent(assistantContent, searchRoots, verbose);
326
+ const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
289
327
  const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
290
328
  const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
291
329
  const userFilePaths = [];
292
- for (const segment of userSegments) {
330
+ for (const segment of inputSegments) {
293
331
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
294
332
  userFilePaths.push(segment.resolvedPath);
295
333
  }
@@ -302,15 +340,16 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
302
340
  id,
303
341
  dataset: datasetName,
304
342
  conversation_id: conversationId,
305
- task: userTextPrompt,
306
- user_segments: userSegments,
343
+ question,
344
+ input_segments: inputSegments,
345
+ output_segments: outputSegments,
307
346
  system_message: systemMessageContent,
308
- expected_assistant_raw: expectedAssistantRaw,
347
+ reference_answer: referenceAnswer,
309
348
  guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
310
349
  guideline_patterns: guidelinePatterns,
311
350
  file_paths: allFilePaths,
312
351
  code_snippets: codeSnippets,
313
- outcome,
352
+ expected_outcome: outcome,
314
353
  evaluator: testCaseEvaluatorKind,
315
354
  evaluators
316
355
  };
@@ -346,36 +385,36 @@ ${content}`);
346
385
  logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
347
386
  }
348
387
  }
349
- const requestParts = [];
350
- for (const segment of testCase.user_segments) {
388
+ const questionParts = [];
389
+ for (const segment of testCase.input_segments) {
351
390
  const typeValue = segment.type;
352
391
  if (typeof typeValue === "string" && typeValue === "file") {
353
392
  const pathValue = segment.path;
354
393
  const textValue = segment.text;
355
394
  const label = typeof pathValue === "string" ? pathValue : "file";
356
395
  const body = typeof textValue === "string" ? textValue : "";
357
- requestParts.push(`=== ${label} ===
396
+ questionParts.push(`=== ${label} ===
358
397
  ${body}`);
359
398
  continue;
360
399
  }
361
400
  if (typeof typeValue === "string" && typeValue === "text") {
362
401
  const value = segment.value;
363
402
  if (typeof value === "string") {
364
- requestParts.push(value);
403
+ questionParts.push(value);
365
404
  }
366
405
  continue;
367
406
  }
368
407
  const genericValue = segment.value;
369
408
  if (typeof genericValue === "string") {
370
- requestParts.push(genericValue);
409
+ questionParts.push(genericValue);
371
410
  }
372
411
  }
373
412
  if (testCase.code_snippets.length > 0) {
374
- requestParts.push(testCase.code_snippets.join("\n"));
413
+ questionParts.push(testCase.code_snippets.join("\n"));
375
414
  }
376
- const request = requestParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
415
+ const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
377
416
  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
378
- return { request, guidelines, systemMessage: testCase.system_message };
417
+ return { question, guidelines, systemMessage: testCase.system_message };
379
418
  }
380
419
  async function fileExists2(absolutePath) {
381
420
  try {
@@ -587,7 +626,7 @@ function buildChatPrompt(request) {
587
626
  ${request.guidelines.trim()}`);
588
627
  }
589
628
  const systemContent = systemSegments.join("\n\n");
590
- const userContent = request.prompt.trim();
629
+ const userContent = request.question.trim();
591
630
  const prompt = [
592
631
  {
593
632
  role: "system",
@@ -885,7 +924,7 @@ var CliProvider = class {
885
924
  healthcheck.commandTemplate,
886
925
  buildTemplateValues(
887
926
  {
888
- prompt: "",
927
+ question: "",
889
928
  guidelines: "",
890
929
  inputFiles: [],
891
930
  evalCaseId: "",
@@ -912,7 +951,7 @@ var CliProvider = class {
912
951
  function buildTemplateValues(request, config) {
913
952
  const inputFiles = normalizeInputFiles(request.inputFiles);
914
953
  return {
915
- PROMPT: shellEscape(request.prompt ?? ""),
954
+ PROMPT: shellEscape(request.question ?? ""),
916
955
  GUIDELINES: shellEscape(request.guidelines ?? ""),
917
956
  EVAL_ID: shellEscape(request.evalCaseId ?? ""),
918
957
  ATTEMPT: shellEscape(String(request.attempt ?? 0)),
@@ -976,6 +1015,59 @@ import { tmpdir } from "node:os";
976
1015
  import path4 from "node:path";
977
1016
  import { promisify as promisify2 } from "node:util";
978
1017
 
1018
+ // src/evaluation/providers/codex-log-tracker.ts
1019
+ var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
1020
+ var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
1021
/**
 * Returns the array stored on `globalThis` under `GLOBAL_LOGS_KEY`,
 * creating and installing an empty array the first time it is requested.
 *
 * @returns {Array} The shared log-entry array.
 */
function getCodexLogStore() {
  const registry = globalThis;
  if (!registry[GLOBAL_LOGS_KEY]) {
    registry[GLOBAL_LOGS_KEY] = [];
  }
  return registry[GLOBAL_LOGS_KEY];
}
1031
/**
 * Returns the `Set` stored on `globalThis` under `GLOBAL_SUBSCRIBERS_KEY`,
 * creating and installing an empty one the first time it is requested.
 *
 * @returns {Set} The shared set of log listeners.
 */
function getSubscriberStore() {
  const registry = globalThis;
  if (!registry[GLOBAL_SUBSCRIBERS_KEY]) {
    registry[GLOBAL_SUBSCRIBERS_KEY] = new Set();
  }
  return registry[GLOBAL_SUBSCRIBERS_KEY];
}
1041
/**
 * Calls every registered listener with `entry`.
 *
 * The listener set is copied before iteration, so listeners added or removed
 * during a notification round do not affect that round. A listener that throws
 * is reported with `console.warn` and does not stop the remaining listeners.
 *
 * @param {*} entry Value handed to each listener.
 */
function notifySubscribers(entry) {
  const snapshot = [...getSubscriberStore()];
  snapshot.forEach((listener) => {
    try {
      listener(entry);
    } catch (failure) {
      const reason = failure instanceof Error ? failure.message : String(failure);
      console.warn(`Codex log subscriber failed: ${reason}`);
    }
  });
}
1052
/**
 * Appends `entry` to the shared log store, then notifies all subscribers.
 *
 * @param {*} entry Log entry to record.
 */
function recordCodexLogEntry(entry) {
  const entries = getCodexLogStore();
  entries.push(entry);
  notifySubscribers(entry);
}
1056
/**
 * Removes and returns every entry currently held in the shared log store.
 *
 * @returns {Array} The drained entries in insertion order; a fresh empty array
 *   when the store is empty.
 */
function consumeCodexLogEntries() {
  const pending = getCodexLogStore();
  return pending.length === 0 ? [] : pending.splice(0, pending.length);
}
1063
/**
 * Registers `listener` in the shared subscriber set.
 *
 * @param {Function} listener Callback to register.
 * @returns {Function} Zero-argument function that removes `listener` from the
 *   same set again.
 */
function subscribeToCodexLogEntries(listener) {
  const subscribers = getSubscriberStore();
  subscribers.add(listener);
  return () => {
    subscribers.delete(listener);
  };
}
1070
+
979
1071
  // src/evaluation/providers/preread.ts
980
1072
  import path3 from "node:path";
981
1073
  function buildPromptDocument(request, inputFiles, options) {
@@ -993,7 +1085,7 @@ function buildPromptDocument(request, inputFiles, options) {
993
1085
  if (prereadBlock.length > 0) {
994
1086
  parts.push("\n", prereadBlock);
995
1087
  }
996
- parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
1088
+ parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
997
1089
  return parts.join("\n").trim();
998
1090
  }
999
1091
  function normalizeInputFiles2(inputFiles) {
@@ -1077,59 +1169,6 @@ function pathToFileUri(filePath) {
1077
1169
  return `file://${normalizedPath}`;
1078
1170
  }
1079
1171
 
1080
- // src/evaluation/providers/codex-log-tracker.ts
1081
- var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
1082
- var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
1083
- function getCodexLogStore() {
1084
- const globalObject = globalThis;
1085
- const existing = globalObject[GLOBAL_LOGS_KEY];
1086
- if (existing) {
1087
- return existing;
1088
- }
1089
- const created = [];
1090
- globalObject[GLOBAL_LOGS_KEY] = created;
1091
- return created;
1092
- }
1093
- function getSubscriberStore() {
1094
- const globalObject = globalThis;
1095
- const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
1096
- if (existing) {
1097
- return existing;
1098
- }
1099
- const created = /* @__PURE__ */ new Set();
1100
- globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
1101
- return created;
1102
- }
1103
- function notifySubscribers(entry) {
1104
- const subscribers = Array.from(getSubscriberStore());
1105
- for (const listener of subscribers) {
1106
- try {
1107
- listener(entry);
1108
- } catch (error) {
1109
- const message = error instanceof Error ? error.message : String(error);
1110
- console.warn(`Codex log subscriber failed: ${message}`);
1111
- }
1112
- }
1113
- }
1114
- function recordCodexLogEntry(entry) {
1115
- getCodexLogStore().push(entry);
1116
- notifySubscribers(entry);
1117
- }
1118
- function consumeCodexLogEntries() {
1119
- const store = getCodexLogStore();
1120
- if (store.length === 0) {
1121
- return [];
1122
- }
1123
- return store.splice(0, store.length);
1124
- }
1125
- function subscribeToCodexLogEntries(listener) {
1126
- const store = getSubscriberStore();
1127
- store.add(listener);
1128
- return () => {
1129
- store.delete(listener);
1130
- };
1131
- }
1132
-
1133
1172
  // src/evaluation/providers/codex.ts
1134
1173
  var execAsync2 = promisify2(execCallback);
1135
1174
  var WORKSPACE_PREFIX = "agentv-codex-";
@@ -1863,7 +1902,7 @@ var MockProvider = class {
1863
1902
  return {
1864
1903
  text: this.cannedResponse,
1865
1904
  raw: {
1866
- prompt: request.prompt,
1905
+ question: request.question,
1867
1906
  guidelines: request.guidelines
1868
1907
  }
1869
1908
  };
@@ -2479,7 +2518,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
2479
2518
  if (prereadBlock.length > 0) {
2480
2519
  parts.push("\n", prereadBlock);
2481
2520
  }
2482
- parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
2521
+ parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
2483
2522
  return parts.join("\n").trim();
2484
2523
  }
2485
2524
  function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
@@ -2716,30 +2755,7 @@ function resolveAndCreateProvider(definition, env = process.env) {
2716
2755
  }
2717
2756
 
2718
2757
  // src/evaluation/evaluators.ts
2719
- import { ax, f } from "@ax-llm/ax";
2720
2758
  import { randomUUID as randomUUID2 } from "node:crypto";
2721
- var LLM_JUDGE_SIGNATURE = f().input(
2722
- "evaluationContext",
2723
- f.object(
2724
- {
2725
- expectedOutcome: f.string("The expected outcome for the original task"),
2726
- request: f.string("The original task request"),
2727
- referenceAnswer: f.string("The gold standard reference answer"),
2728
- generatedAnswer: f.string("The answer to evaluate"),
2729
- guidelines: f.string("Additional evaluation guidelines or instructions").optional()
2730
- },
2731
- "Complete evaluation context for the judge"
2732
- )
2733
- ).output(
2734
- "evaluation",
2735
- f.object({
2736
- score: f.number("Score between 0.0 and 1.0").min(0).max(1),
2737
- hits: f.string("Brief specific achievement").array(),
2738
- misses: f.string("Brief specific failure or omission").array(),
2739
- reasoning: f.string("Concise explanation for the score").max(500)
2740
- })
2741
- ).build();
2742
- var LLM_JUDGE = ax(LLM_JUDGE_SIGNATURE);
2743
2759
  var LlmJudgeEvaluator = class {
2744
2760
  kind = "llm_judge";
2745
2761
  resolveJudgeProvider;
@@ -2757,52 +2773,29 @@ var LlmJudgeEvaluator = class {
2757
2773
  if (!judgeProvider) {
2758
2774
  throw new Error("No judge provider available for LLM grading");
2759
2775
  }
2760
- if (providerSupportsAx(judgeProvider)) {
2761
- return this.evaluateWithAx(context, judgeProvider);
2762
- }
2763
2776
  return this.evaluateWithPrompt(context, judgeProvider);
2764
2777
  }
2765
- async evaluateWithAx(context, judgeProvider) {
2766
- const ai = judgeProvider.getAxAI();
2767
- const guidelines = context.promptInputs.guidelines?.trim();
2768
- const evaluationContext = {
2769
- expectedOutcome: context.evalCase.outcome.trim(),
2770
- request: context.evalCase.task.trim(),
2771
- referenceAnswer: context.evalCase.expected_assistant_raw.trim(),
2772
- generatedAnswer: context.candidate.trim(),
2773
- ...guidelines ? { guidelines } : {}
2774
- };
2775
- const options = this.buildJudgeForwardOptions(context);
2776
- const result = await LLM_JUDGE.forward(ai, { evaluationContext }, options);
2777
- const evaluation = result.evaluation;
2778
- const expectedAspectCount = Math.max(
2779
- evaluation.hits.length + evaluation.misses.length,
2780
- 1
2781
- );
2782
- return {
2783
- score: evaluation.score,
2784
- hits: evaluation.hits,
2785
- misses: evaluation.misses,
2786
- expectedAspectCount,
2787
- reasoning: evaluation.reasoning,
2788
- evaluatorRawRequest: {
2789
- id: randomUUID2(),
2790
- provider: judgeProvider.id,
2791
- target: context.target.name,
2792
- method: "ax-structured-output",
2793
- signature: LLM_JUDGE_SIGNATURE.toString()
2794
- }
2795
- };
2796
- }
2797
2778
  async evaluateWithPrompt(context, judgeProvider) {
2798
- const prompt = buildQualityPrompt(context.evalCase, context.candidate);
2799
- const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
2779
+ let prompt = buildQualityPrompt(context.evalCase, context.candidate);
2780
+ let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
2781
+ if (systemPrompt && hasTemplateVariables(systemPrompt)) {
2782
+ const variables = {
2783
+ input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
2784
+ output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
2785
+ candidate_answer: context.candidate,
2786
+ reference_answer: context.evalCase.reference_answer,
2787
+ expected_outcome: context.evalCase.expected_outcome,
2788
+ question: context.evalCase.question
2789
+ };
2790
+ prompt = substituteVariables(systemPrompt, variables);
2791
+ systemPrompt = QUALITY_SYSTEM_PROMPT;
2792
+ }
2800
2793
  const metadata = {
2801
2794
  ...systemPrompt !== void 0 ? { systemPrompt } : {},
2802
2795
  ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
2803
2796
  };
2804
2797
  const response = await judgeProvider.invoke({
2805
- prompt,
2798
+ question: prompt,
2806
2799
  metadata,
2807
2800
  evalCaseId: context.evalCase.id,
2808
2801
  attempt: context.attempt,
@@ -2832,33 +2825,11 @@ var LlmJudgeEvaluator = class {
2832
2825
  evaluatorRawRequest
2833
2826
  };
2834
2827
  }
2835
- buildJudgeForwardOptions(context) {
2836
- const modelConfig = this.buildJudgeModelConfig();
2837
- if (modelConfig === void 0 && context.judgeModel === void 0) {
2838
- return void 0;
2839
- }
2840
- return {
2841
- ...context.judgeModel ? { model: context.judgeModel } : {},
2842
- ...modelConfig ? { modelConfig } : {}
2843
- };
2844
- }
2845
- buildJudgeModelConfig() {
2846
- if (this.maxOutputTokens === void 0 && this.temperature === void 0) {
2847
- return void 0;
2848
- }
2849
- return {
2850
- ...this.maxOutputTokens !== void 0 ? { maxTokens: this.maxOutputTokens } : {},
2851
- ...this.temperature !== void 0 ? { temperature: this.temperature } : {}
2852
- };
2853
- }
2854
2828
  };
2855
- function providerSupportsAx(provider) {
2856
- return typeof provider.getAxAI === "function";
2857
- }
2858
2829
  var QUALITY_SYSTEM_PROMPT = [
2859
- "You are an expert evaluator. Your goal is to grade the generated_answer based on how well it achieves the expected_outcome for the original task.",
2830
+ "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
2860
2831
  "",
2861
- "Use the reference_answer as a gold standard for a high-quality response. The generated_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
2832
+ "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
2862
2833
  "",
2863
2834
  "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
2864
2835
  "",
@@ -2871,18 +2842,18 @@ var QUALITY_SYSTEM_PROMPT = [
2871
2842
  ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
2872
2843
  "}"
2873
2844
  ].join("\n");
2874
- function buildQualityPrompt(testCase, candidate) {
2845
+ function buildQualityPrompt(evalCase, candidate) {
2875
2846
  const parts = [
2876
2847
  "[[ ## expected_outcome ## ]]",
2877
- testCase.outcome.trim(),
2848
+ evalCase.expected_outcome.trim(),
2878
2849
  "",
2879
- "[[ ## request ## ]]",
2880
- testCase.task.trim(),
2850
+ "[[ ## question ## ]]",
2851
+ evalCase.question.trim(),
2881
2852
  "",
2882
2853
  "[[ ## reference_answer ## ]]",
2883
- testCase.expected_assistant_raw.trim(),
2854
+ evalCase.reference_answer.trim(),
2884
2855
  "",
2885
- "[[ ## generated_answer ## ]]",
2856
+ "[[ ## candidate_answer ## ]]",
2886
2857
  candidate.trim(),
2887
2858
  "",
2888
2859
  "Respond with a single JSON object matching the schema described in the system prompt."
@@ -2982,14 +2953,14 @@ var CodeEvaluator = class {
2982
2953
  async evaluate(context) {
2983
2954
  const inputPayload = JSON.stringify(
2984
2955
  {
2985
- task: context.evalCase.task,
2986
- outcome: context.evalCase.outcome,
2987
- expected: context.evalCase.expected_assistant_raw,
2988
- output: context.candidate,
2956
+ question: context.evalCase.question,
2957
+ expected_outcome: context.evalCase.expected_outcome,
2958
+ reference_answer: context.evalCase.reference_answer,
2959
+ candidate_answer: context.candidate,
2989
2960
  system_message: context.promptInputs.systemMessage ?? "",
2990
2961
  guideline_paths: context.evalCase.guideline_paths,
2991
- attachments: context.evalCase.file_paths,
2992
- user_segments: context.evalCase.user_segments
2962
+ input_files: context.evalCase.file_paths,
2963
+ input_segments: context.evalCase.input_segments
2993
2964
  },
2994
2965
  null,
2995
2966
  2
@@ -3075,6 +3046,14 @@ function parseJsonSafe(payload) {
3075
3046
  return void 0;
3076
3047
  }
3077
3048
  }
3049
/**
 * Reports whether `text` contains at least one `${name}` placeholder, where
 * `name` consists of ASCII letters, digits and underscores.
 *
 * @param {string} text Text to inspect.
 * @returns {boolean} True when a placeholder is present.
 */
function hasTemplateVariables(text) {
  const placeholderPattern = /\$\{[a-zA-Z0-9_]+\}/;
  return placeholderPattern.test(text);
}
3052
/**
 * Replaces each `${name}` placeholder in `template` with `variables[name]`.
 *
 * A placeholder is left untouched when `variables` has no own property of that
 * name, or when the value is `null`/`undefined`.
 *
 * @param {string} template Text containing `${name}` placeholders.
 * @param {Object<string, *>} variables Replacement values keyed by placeholder name.
 * @returns {string} The template with known placeholders substituted.
 */
function substituteVariables(template, variables) {
  return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (placeholder, varName) => {
    // Own-property check: a bare `variables[varName]` lookup also finds inherited
    // members such as `constructor` or `toString`, which would splice native
    // function source text into the prompt.
    if (!Object.prototype.hasOwnProperty.call(variables, varName)) {
      return placeholder;
    }
    return variables[varName] ?? placeholder;
  });
}
3078
3057
 
3079
3058
  // src/evaluation/orchestrator.ts
3080
3059
  import { createHash, randomUUID as randomUUID3 } from "node:crypto";
@@ -3431,7 +3410,7 @@ async function runBatchEvaluation(options) {
3431
3410
  const batchRequests = evalCases.map((evalCase, index) => {
3432
3411
  const promptInputs = promptInputsList[index];
3433
3412
  return {
3434
- prompt: promptInputs.request,
3413
+ question: promptInputs.question,
3435
3414
  guidelines: promptInputs.guidelines,
3436
3415
  guideline_patterns: evalCase.guideline_patterns,
3437
3416
  inputFiles: evalCase.file_paths,
@@ -3618,7 +3597,7 @@ async function evaluateCandidate(options) {
3618
3597
  });
3619
3598
  const completedAt = nowFn();
3620
3599
  const rawRequest = {
3621
- request: promptInputs.request,
3600
+ question: promptInputs.question,
3622
3601
  guidelines: promptInputs.guidelines,
3623
3602
  guideline_paths: evalCase.guideline_paths,
3624
3603
  system_message: promptInputs.systemMessage ?? ""
@@ -3630,7 +3609,7 @@ async function evaluateCandidate(options) {
3630
3609
  score: score.score,
3631
3610
  hits: score.hits,
3632
3611
  misses: score.misses,
3633
- model_answer: candidate,
3612
+ candidate_answer: candidate,
3634
3613
  expected_aspect_count: score.expectedAspectCount,
3635
3614
  target: target.name,
3636
3615
  timestamp: completedAt.toISOString(),
@@ -3840,7 +3819,7 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
3840
3819
  await mkdir2(path7.dirname(filePath), { recursive: true });
3841
3820
  const payload = {
3842
3821
  eval_id: evalCase.id,
3843
- request: promptInputs.request,
3822
+ question: promptInputs.question,
3844
3823
  guidelines: promptInputs.guidelines,
3845
3824
  guideline_paths: evalCase.guideline_paths
3846
3825
  };
@@ -3862,7 +3841,7 @@ async function invokeProvider(provider, options) {
3862
3841
  }
3863
3842
  try {
3864
3843
  return await provider.invoke({
3865
- prompt: promptInputs.request,
3844
+ question: promptInputs.question,
3866
3845
  guidelines: promptInputs.guidelines,
3867
3846
  guideline_patterns: evalCase.guideline_patterns,
3868
3847
  inputFiles: evalCase.file_paths,
@@ -3882,7 +3861,7 @@ async function invokeProvider(provider, options) {
3882
3861
  function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
3883
3862
  const message = error instanceof Error ? error.message : String(error);
3884
3863
  const rawRequest = {
3885
- request: promptInputs.request,
3864
+ question: promptInputs.question,
3886
3865
  guidelines: promptInputs.guidelines,
3887
3866
  guideline_paths: evalCase.guideline_paths,
3888
3867
  system_message: promptInputs.systemMessage ?? "",
@@ -3895,7 +3874,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
3895
3874
  score: 0,
3896
3875
  hits: [],
3897
3876
  misses: [`Error: ${message}`],
3898
- model_answer: `Error occurred: ${message}`,
3877
+ candidate_answer: `Error occurred: ${message}`,
3899
3878
  expected_aspect_count: 0,
3900
3879
  target: targetName,
3901
3880
  timestamp: timestamp.toISOString(),
@@ -3908,7 +3887,7 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
3908
3887
  hash.update(provider.id);
3909
3888
  hash.update(target.name);
3910
3889
  hash.update(evalCase.id);
3911
- hash.update(promptInputs.request);
3890
+ hash.update(promptInputs.question);
3912
3891
  hash.update(promptInputs.guidelines);
3913
3892
  hash.update(promptInputs.systemMessage ?? "");
3914
3893
  return hash.digest("hex");