@agentv/core 0.6.1 → 0.7.0

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registries.
package/dist/index.cjs CHANGED
@@ -299,6 +299,87 @@ function extractCodeBlocks(segments) {
  }
  return codeBlocks;
  }
+ async function processMessages(options) {
+ const {
+ messages,
+ searchRoots,
+ repoRootPath,
+ guidelinePatterns,
+ guidelinePaths,
+ textParts,
+ messageType,
+ verbose
+ } = options;
+ const segments = [];
+ for (const message of messages) {
+ const content = message.content;
+ if (typeof content === "string") {
+ segments.push({ type: "text", value: content });
+ if (textParts) {
+ textParts.push(content);
+ }
+ continue;
+ }
+ for (const rawSegment of content) {
+ if (!isJsonObject(rawSegment)) {
+ continue;
+ }
+ const segmentType = asString(rawSegment.type);
+ if (segmentType === "file") {
+ const rawValue = asString(rawSegment.value);
+ if (!rawValue) {
+ continue;
+ }
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference(
+ rawValue,
+ searchRoots
+ );
+ if (!resolvedPath) {
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
+ const context = messageType === "input" ? "" : " in expected_messages";
+ logWarning(`File not found${context}: ${displayPath}`, attempts);
+ continue;
+ }
+ try {
+ const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
+ if (messageType === "input" && guidelinePatterns && guidelinePaths) {
+ const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
+ if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
+ guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
+ if (verbose) {
+ console.log(` [Guideline] Found: ${displayPath}`);
+ console.log(` Resolved to: ${resolvedPath}`);
+ }
+ continue;
+ }
+ }
+ segments.push({
+ type: "file",
+ path: displayPath,
+ text: fileContent,
+ resolvedPath: import_node_path2.default.resolve(resolvedPath)
+ });
+ if (verbose) {
+ const label = messageType === "input" ? "[File]" : "[Expected Output File]";
+ console.log(` ${label} Found: ${displayPath}`);
+ console.log(` Resolved to: ${resolvedPath}`);
+ }
+ } catch (error) {
+ const context = messageType === "input" ? "" : " expected output";
+ logWarning(`Could not read${context} file ${resolvedPath}: ${error.message}`);
+ }
+ continue;
+ }
+ const clonedSegment = cloneJsonObject(rawSegment);
+ segments.push(clonedSegment);
+ const inlineValue = clonedSegment.value;
+ if (typeof inlineValue === "string" && textParts) {
+ textParts.push(inlineValue);
+ }
+ }
+ }
+ return segments;
+ }
  async function loadEvalCases(evalFilePath, repoRoot, options) {
  const verbose = options?.verbose ?? false;
  const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
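A minimal usage sketch (illustrative only, not part of the published bundle) of the new processMessages helper introduced above; the message and path values are hypothetical, while the option names and returned segment shapes follow the code in this hunk:

    // Hypothetical eval-case messages; file segments are resolved against searchRoots.
    const textParts = [];
    const guidelinePaths = [];
    const segments = await processMessages({
      messages: [{ content: [{ type: "file", value: "docs/example.md" }] }],
      searchRoots: ["/path/to/repo"],
      repoRootPath: "/path/to/repo",
      guidelinePatterns: ["guidelines/**"],
      guidelinePaths,
      textParts,
      messageType: "input",
      verbose: false
    });
    // Resolved files become { type: "file", path, text, resolvedPath } segments;
    // plain-string messages become { type: "text", value } segments and are mirrored into textParts.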
@@ -384,77 +465,34 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
  }
  }
  }
- const userSegments = [];
  const guidelinePaths = [];
- const userTextParts = [];
- for (const userMessage of userMessages) {
- const content = userMessage.content;
- if (typeof content === "string") {
- userSegments.push({ type: "text", value: content });
- userTextParts.push(content);
- continue;
- }
- for (const rawSegment of content) {
- if (!isJsonObject(rawSegment)) {
- continue;
- }
- const segmentType = asString(rawSegment.type);
- if (segmentType === "file") {
- const rawValue = asString(rawSegment.value);
- if (!rawValue) {
- continue;
- }
- const { displayPath, resolvedPath, attempted } = await resolveFileReference(
- rawValue,
- searchRoots
- );
- if (!resolvedPath) {
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
- logWarning(`File not found: ${displayPath}`, attempts);
- continue;
- }
- try {
- const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
- const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
- if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
- guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
- if (verbose) {
- console.log(` [Guideline] Found: ${displayPath}`);
- console.log(` Resolved to: ${resolvedPath}`);
- }
- } else {
- userSegments.push({
- type: "file",
- path: displayPath,
- text: fileContent,
- resolvedPath: import_node_path2.default.resolve(resolvedPath)
- });
- if (verbose) {
- console.log(` [File] Found: ${displayPath}`);
- console.log(` Resolved to: ${resolvedPath}`);
- }
- }
- } catch (error) {
- logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
- }
- continue;
- }
- const clonedSegment = cloneJsonObject(rawSegment);
- userSegments.push(clonedSegment);
- const inlineValue = clonedSegment.value;
- if (typeof inlineValue === "string") {
- userTextParts.push(inlineValue);
- }
- }
- }
- const codeSnippets = extractCodeBlocks(userSegments);
+ const inputTextParts = [];
+ const inputSegments = await processMessages({
+ messages: userMessages,
+ searchRoots,
+ repoRootPath,
+ guidelinePatterns,
+ guidelinePaths,
+ textParts: inputTextParts,
+ messageType: "input",
+ verbose
+ });
+ const outputSegments = await processMessages({
+ messages: assistantMessages,
+ searchRoots,
+ repoRootPath,
+ guidelinePatterns,
+ messageType: "output",
+ verbose
+ });
+ const codeSnippets = extractCodeBlocks(inputSegments);
  const assistantContent = assistantMessages[0]?.content;
- const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
- const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
+ const referenceAnswer = await resolveAssistantContent(assistantContent, searchRoots, verbose);
+ const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
  const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
  const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
  const userFilePaths = [];
- for (const segment of userSegments) {
+ for (const segment of inputSegments) {
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
  userFilePaths.push(segment.resolvedPath);
  }
@@ -467,15 +505,16 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
  id,
  dataset: datasetName,
  conversation_id: conversationId,
- task: userTextPrompt,
- user_segments: userSegments,
+ question,
+ input_segments: inputSegments,
+ output_segments: outputSegments,
  system_message: systemMessageContent,
- expected_assistant_raw: expectedAssistantRaw,
+ reference_answer: referenceAnswer,
  guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
  guideline_patterns: guidelinePatterns,
  file_paths: allFilePaths,
  code_snippets: codeSnippets,
- outcome,
+ expected_outcome: outcome,
  evaluator: testCaseEvaluatorKind,
  evaluators
  };
@@ -511,36 +550,36 @@ ${content}`);
  logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
  }
  }
- const requestParts = [];
- for (const segment of testCase.user_segments) {
+ const questionParts = [];
+ for (const segment of testCase.input_segments) {
  const typeValue = segment.type;
  if (typeof typeValue === "string" && typeValue === "file") {
  const pathValue = segment.path;
  const textValue = segment.text;
  const label = typeof pathValue === "string" ? pathValue : "file";
  const body = typeof textValue === "string" ? textValue : "";
- requestParts.push(`=== ${label} ===
+ questionParts.push(`=== ${label} ===
  ${body}`);
  continue;
  }
  if (typeof typeValue === "string" && typeValue === "text") {
  const value = segment.value;
  if (typeof value === "string") {
- requestParts.push(value);
+ questionParts.push(value);
  }
  continue;
  }
  const genericValue = segment.value;
  if (typeof genericValue === "string") {
- requestParts.push(genericValue);
+ questionParts.push(genericValue);
  }
  }
  if (testCase.code_snippets.length > 0) {
- requestParts.push(testCase.code_snippets.join("\n"));
+ questionParts.push(testCase.code_snippets.join("\n"));
  }
- const request = requestParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
+ const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
- return { request, guidelines, systemMessage: testCase.system_message };
+ return { question, guidelines, systemMessage: testCase.system_message };
  }
  async function fileExists2(absolutePath) {
  try {
@@ -752,7 +791,7 @@ function buildChatPrompt(request) {
  ${request.guidelines.trim()}`);
  }
  const systemContent = systemSegments.join("\n\n");
- const userContent = request.prompt.trim();
+ const userContent = request.question.trim();
  const prompt = [
  {
  role: "system",
@@ -1050,7 +1089,7 @@ var CliProvider = class {
  healthcheck.commandTemplate,
  buildTemplateValues(
  {
- prompt: "",
+ question: "",
  guidelines: "",
  inputFiles: [],
  evalCaseId: "",
@@ -1077,7 +1116,7 @@ var CliProvider = class {
  function buildTemplateValues(request, config) {
  const inputFiles = normalizeInputFiles(request.inputFiles);
  return {
- PROMPT: shellEscape(request.prompt ?? ""),
+ PROMPT: shellEscape(request.question ?? ""),
  GUIDELINES: shellEscape(request.guidelines ?? ""),
  EVAL_ID: shellEscape(request.evalCaseId ?? ""),
  ATTEMPT: shellEscape(String(request.attempt ?? 0)),
@@ -1141,6 +1180,59 @@ var import_node_os = require("os");
  var import_node_path5 = __toESM(require("path"), 1);
  var import_node_util2 = require("util");

+ // src/evaluation/providers/codex-log-tracker.ts
+ var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
+ var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
+ function getCodexLogStore() {
+ const globalObject = globalThis;
+ const existing = globalObject[GLOBAL_LOGS_KEY];
+ if (existing) {
+ return existing;
+ }
+ const created = [];
+ globalObject[GLOBAL_LOGS_KEY] = created;
+ return created;
+ }
+ function getSubscriberStore() {
+ const globalObject = globalThis;
+ const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
+ if (existing) {
+ return existing;
+ }
+ const created = /* @__PURE__ */ new Set();
+ globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
+ return created;
+ }
+ function notifySubscribers(entry) {
+ const subscribers = Array.from(getSubscriberStore());
+ for (const listener of subscribers) {
+ try {
+ listener(entry);
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error);
+ console.warn(`Codex log subscriber failed: ${message}`);
+ }
+ }
+ }
+ function recordCodexLogEntry(entry) {
+ getCodexLogStore().push(entry);
+ notifySubscribers(entry);
+ }
+ function consumeCodexLogEntries() {
+ const store = getCodexLogStore();
+ if (store.length === 0) {
+ return [];
+ }
+ return store.splice(0, store.length);
+ }
+ function subscribeToCodexLogEntries(listener) {
+ const store = getSubscriberStore();
+ store.add(listener);
+ return () => {
+ store.delete(listener);
+ };
+ }
+
  // src/evaluation/providers/preread.ts
  var import_node_path4 = __toESM(require("path"), 1);
  function buildPromptDocument(request, inputFiles, options) {
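An illustrative sketch (not from the bundle) of how the relocated codex-log-tracker helpers combine; the entry object is a made-up example, while the function names and signatures are those defined in this hunk:

    // Listeners receive entries as they are recorded; the global store keeps them until drained.
    const unsubscribe = subscribeToCodexLogEntries((entry) => {
      console.log("codex log entry:", entry);
    });
    recordCodexLogEntry({ message: "example entry" }); // hypothetical entry shape
    const pending = consumeCodexLogEntries(); // drains the global store and returns everything recorded so far
    unsubscribe();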
@@ -1158,7 +1250,7 @@ function buildPromptDocument(request, inputFiles, options) {
  if (prereadBlock.length > 0) {
  parts.push("\n", prereadBlock);
  }
- parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
+ parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
  return parts.join("\n").trim();
  }
  function normalizeInputFiles2(inputFiles) {
@@ -1242,59 +1334,6 @@ function pathToFileUri(filePath) {
  return `file://${normalizedPath}`;
  }

- // src/evaluation/providers/codex-log-tracker.ts
- var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
- var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
- function getCodexLogStore() {
- const globalObject = globalThis;
- const existing = globalObject[GLOBAL_LOGS_KEY];
- if (existing) {
- return existing;
- }
- const created = [];
- globalObject[GLOBAL_LOGS_KEY] = created;
- return created;
- }
- function getSubscriberStore() {
- const globalObject = globalThis;
- const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
- if (existing) {
- return existing;
- }
- const created = /* @__PURE__ */ new Set();
- globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
- return created;
- }
- function notifySubscribers(entry) {
- const subscribers = Array.from(getSubscriberStore());
- for (const listener of subscribers) {
- try {
- listener(entry);
- } catch (error) {
- const message = error instanceof Error ? error.message : String(error);
- console.warn(`Codex log subscriber failed: ${message}`);
- }
- }
- }
- function recordCodexLogEntry(entry) {
- getCodexLogStore().push(entry);
- notifySubscribers(entry);
- }
- function consumeCodexLogEntries() {
- const store = getCodexLogStore();
- if (store.length === 0) {
- return [];
- }
- return store.splice(0, store.length);
- }
- function subscribeToCodexLogEntries(listener) {
- const store = getSubscriberStore();
- store.add(listener);
- return () => {
- store.delete(listener);
- };
- }
-
  // src/evaluation/providers/codex.ts
  var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
  var WORKSPACE_PREFIX = "agentv-codex-";
@@ -2028,7 +2067,7 @@ var MockProvider = class {
  return {
  text: this.cannedResponse,
  raw: {
- prompt: request.prompt,
+ question: request.question,
  guidelines: request.guidelines
  }
  };
@@ -2644,7 +2683,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
  if (prereadBlock.length > 0) {
  parts.push("\n", prereadBlock);
  }
- parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
+ parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
  return parts.join("\n").trim();
  }
  function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
@@ -2886,30 +2925,7 @@ function resolveAndCreateProvider(definition, env = process.env) {
  }

  // src/evaluation/evaluators.ts
- var import_ax3 = require("@ax-llm/ax");
  var import_node_crypto2 = require("crypto");
- var LLM_JUDGE_SIGNATURE = (0, import_ax3.f)().input(
- "evaluationContext",
- import_ax3.f.object(
- {
- expectedOutcome: import_ax3.f.string("The expected outcome for the original task"),
- request: import_ax3.f.string("The original task request"),
- referenceAnswer: import_ax3.f.string("The gold standard reference answer"),
- generatedAnswer: import_ax3.f.string("The answer to evaluate"),
- guidelines: import_ax3.f.string("Additional evaluation guidelines or instructions").optional()
- },
- "Complete evaluation context for the judge"
- )
- ).output(
- "evaluation",
- import_ax3.f.object({
- score: import_ax3.f.number("Score between 0.0 and 1.0").min(0).max(1),
- hits: import_ax3.f.string("Brief specific achievement").array(),
- misses: import_ax3.f.string("Brief specific failure or omission").array(),
- reasoning: import_ax3.f.string("Concise explanation for the score").max(500)
- })
- ).build();
- var LLM_JUDGE = (0, import_ax3.ax)(LLM_JUDGE_SIGNATURE);
  var LlmJudgeEvaluator = class {
  kind = "llm_judge";
  resolveJudgeProvider;
@@ -2927,52 +2943,29 @@ var LlmJudgeEvaluator = class {
  if (!judgeProvider) {
  throw new Error("No judge provider available for LLM grading");
  }
- if (providerSupportsAx(judgeProvider)) {
- return this.evaluateWithAx(context, judgeProvider);
- }
  return this.evaluateWithPrompt(context, judgeProvider);
  }
- async evaluateWithAx(context, judgeProvider) {
- const ai = judgeProvider.getAxAI();
- const guidelines = context.promptInputs.guidelines?.trim();
- const evaluationContext = {
- expectedOutcome: context.evalCase.outcome.trim(),
- request: context.evalCase.task.trim(),
- referenceAnswer: context.evalCase.expected_assistant_raw.trim(),
- generatedAnswer: context.candidate.trim(),
- ...guidelines ? { guidelines } : {}
- };
- const options = this.buildJudgeForwardOptions(context);
- const result = await LLM_JUDGE.forward(ai, { evaluationContext }, options);
- const evaluation = result.evaluation;
- const expectedAspectCount = Math.max(
- evaluation.hits.length + evaluation.misses.length,
- 1
- );
- return {
- score: evaluation.score,
- hits: evaluation.hits,
- misses: evaluation.misses,
- expectedAspectCount,
- reasoning: evaluation.reasoning,
- evaluatorRawRequest: {
- id: (0, import_node_crypto2.randomUUID)(),
- provider: judgeProvider.id,
- target: context.target.name,
- method: "ax-structured-output",
- signature: LLM_JUDGE_SIGNATURE.toString()
- }
- };
- }
  async evaluateWithPrompt(context, judgeProvider) {
- const prompt = buildQualityPrompt(context.evalCase, context.candidate);
- const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
+ let prompt = buildQualityPrompt(context.evalCase, context.candidate);
+ let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
+ if (systemPrompt && hasTemplateVariables(systemPrompt)) {
+ const variables = {
+ input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
+ output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
+ candidate_answer: context.candidate,
+ reference_answer: context.evalCase.reference_answer,
+ expected_outcome: context.evalCase.expected_outcome,
+ question: context.evalCase.question
+ };
+ prompt = substituteVariables(systemPrompt, variables);
+ systemPrompt = QUALITY_SYSTEM_PROMPT;
+ }
  const metadata = {
  ...systemPrompt !== void 0 ? { systemPrompt } : {},
  ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
  };
  const response = await judgeProvider.invoke({
- prompt,
+ question: prompt,
  metadata,
  evalCaseId: context.evalCase.id,
  attempt: context.attempt,
@@ -3002,33 +2995,11 @@ var LlmJudgeEvaluator = class {
  evaluatorRawRequest
  };
  }
- buildJudgeForwardOptions(context) {
- const modelConfig = this.buildJudgeModelConfig();
- if (modelConfig === void 0 && context.judgeModel === void 0) {
- return void 0;
- }
- return {
- ...context.judgeModel ? { model: context.judgeModel } : {},
- ...modelConfig ? { modelConfig } : {}
- };
- }
- buildJudgeModelConfig() {
- if (this.maxOutputTokens === void 0 && this.temperature === void 0) {
- return void 0;
- }
- return {
- ...this.maxOutputTokens !== void 0 ? { maxTokens: this.maxOutputTokens } : {},
- ...this.temperature !== void 0 ? { temperature: this.temperature } : {}
- };
- }
  };
- function providerSupportsAx(provider) {
- return typeof provider.getAxAI === "function";
- }
  var QUALITY_SYSTEM_PROMPT = [
- "You are an expert evaluator. Your goal is to grade the generated_answer based on how well it achieves the expected_outcome for the original task.",
+ "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
  "",
- "Use the reference_answer as a gold standard for a high-quality response. The generated_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
+ "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
  "",
  "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
  "",
@@ -3041,18 +3012,18 @@ var QUALITY_SYSTEM_PROMPT = [
  ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
  "}"
  ].join("\n");
- function buildQualityPrompt(testCase, candidate) {
+ function buildQualityPrompt(evalCase, candidate) {
  const parts = [
  "[[ ## expected_outcome ## ]]",
- testCase.outcome.trim(),
+ evalCase.expected_outcome.trim(),
  "",
- "[[ ## request ## ]]",
- testCase.task.trim(),
+ "[[ ## question ## ]]",
+ evalCase.question.trim(),
  "",
  "[[ ## reference_answer ## ]]",
- testCase.expected_assistant_raw.trim(),
+ evalCase.reference_answer.trim(),
  "",
- "[[ ## generated_answer ## ]]",
+ "[[ ## candidate_answer ## ]]",
  candidate.trim(),
  "",
  "Respond with a single JSON object matching the schema described in the system prompt."
@@ -3152,14 +3123,14 @@ var CodeEvaluator = class {
  async evaluate(context) {
  const inputPayload = JSON.stringify(
  {
- task: context.evalCase.task,
- outcome: context.evalCase.outcome,
- expected: context.evalCase.expected_assistant_raw,
- output: context.candidate,
+ question: context.evalCase.question,
+ expected_outcome: context.evalCase.expected_outcome,
+ reference_answer: context.evalCase.reference_answer,
+ candidate_answer: context.candidate,
  system_message: context.promptInputs.systemMessage ?? "",
  guideline_paths: context.evalCase.guideline_paths,
- attachments: context.evalCase.file_paths,
- user_segments: context.evalCase.user_segments
+ input_files: context.evalCase.file_paths,
+ input_segments: context.evalCase.input_segments
  },
  null,
  2
@@ -3245,6 +3216,14 @@ function parseJsonSafe(payload) {
  return void 0;
  }
  }
+ function hasTemplateVariables(text) {
+ return /\$\{[a-zA-Z0-9_]+\}/.test(text);
+ }
+ function substituteVariables(template, variables) {
+ return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
+ return variables[varName] ?? match;
+ });
+ }

  // src/evaluation/orchestrator.ts
  var import_node_crypto3 = require("crypto");
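An illustrative sketch (not from the bundle) of the template-variable expansion these helpers enable for custom judge prompts; the template text and values are made up, while the placeholder names match the variables map built in evaluateWithPrompt:

    const customJudgePrompt = "Question: ${question}\nCandidate: ${candidate_answer}\nReference: ${reference_answer}";
    if (hasTemplateVariables(customJudgePrompt)) {
      const filled = substituteVariables(customJudgePrompt, {
        question: "What does 2 + 2 equal?",
        candidate_answer: "4",
        reference_answer: "4"
      });
      // Placeholders without a matching key are left untouched because the replacer falls back to `match`.
      console.log(filled);
    }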
@@ -3601,7 +3580,7 @@ async function runBatchEvaluation(options) {
  const batchRequests = evalCases.map((evalCase, index) => {
  const promptInputs = promptInputsList[index];
  return {
- prompt: promptInputs.request,
+ question: promptInputs.question,
  guidelines: promptInputs.guidelines,
  guideline_patterns: evalCase.guideline_patterns,
  inputFiles: evalCase.file_paths,
@@ -3788,7 +3767,7 @@ async function evaluateCandidate(options) {
  });
  const completedAt = nowFn();
  const rawRequest = {
- request: promptInputs.request,
+ question: promptInputs.question,
  guidelines: promptInputs.guidelines,
  guideline_paths: evalCase.guideline_paths,
  system_message: promptInputs.systemMessage ?? ""
@@ -3800,7 +3779,7 @@ async function evaluateCandidate(options) {
  score: score.score,
  hits: score.hits,
  misses: score.misses,
- model_answer: candidate,
+ candidate_answer: candidate,
  expected_aspect_count: score.expectedAspectCount,
  target: target.name,
  timestamp: completedAt.toISOString(),
@@ -4010,7 +3989,7 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
  await (0, import_promises6.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
  const payload = {
  eval_id: evalCase.id,
- request: promptInputs.request,
+ question: promptInputs.question,
  guidelines: promptInputs.guidelines,
  guideline_paths: evalCase.guideline_paths
  };
@@ -4032,7 +4011,7 @@ async function invokeProvider(provider, options) {
  }
  try {
  return await provider.invoke({
- prompt: promptInputs.request,
+ question: promptInputs.question,
  guidelines: promptInputs.guidelines,
  guideline_patterns: evalCase.guideline_patterns,
  inputFiles: evalCase.file_paths,
@@ -4052,7 +4031,7 @@ async function invokeProvider(provider, options) {
  function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
  const message = error instanceof Error ? error.message : String(error);
  const rawRequest = {
- request: promptInputs.request,
+ question: promptInputs.question,
  guidelines: promptInputs.guidelines,
  guideline_paths: evalCase.guideline_paths,
  system_message: promptInputs.systemMessage ?? "",
@@ -4065,7 +4044,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs)
  score: 0,
  hits: [],
  misses: [`Error: ${message}`],
- model_answer: `Error occurred: ${message}`,
+ candidate_answer: `Error occurred: ${message}`,
  expected_aspect_count: 0,
  target: targetName,
  timestamp: timestamp.toISOString(),
@@ -4078,7 +4057,7 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
  hash.update(provider.id);
  hash.update(target.name);
  hash.update(evalCase.id);
- hash.update(promptInputs.request);
+ hash.update(promptInputs.question);
  hash.update(promptInputs.guidelines);
  hash.update(promptInputs.systemMessage ?? "");
  return hash.digest("hex");