@agentv/core 0.22.1 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -51,7 +51,7 @@ function isTestMessage(value) {
51
51
  }
52
52
  return candidate.content.every(isJsonObject);
53
53
  }
54
- var EVALUATOR_KIND_VALUES = ["code", "llm_judge", "rubric"];
54
+ var EVALUATOR_KIND_VALUES = ["code_judge", "llm_judge", "rubric", "composite"];
55
55
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
56
56
  function isEvaluatorKind(value) {
57
57
  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
@@ -403,10 +403,10 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
403
403
  logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
404
404
  continue;
405
405
  }
406
- if (typeValue === "code") {
406
+ if (typeValue === "code_judge") {
407
407
  const script = asString2(rawEvaluator.script);
408
408
  if (!script) {
409
- logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
409
+ logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
410
410
  continue;
411
411
  }
412
412
  const cwd = asString2(rawEvaluator.cwd);
@@ -417,7 +417,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
417
417
  resolvedCwd = path3.resolve(resolved.resolvedPath);
418
418
  } else {
419
419
  logWarning2(
420
- `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
420
+ `Code_judge evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
421
421
  resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
422
422
  );
423
423
  }
@@ -433,6 +433,105 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
433
433
  });
434
434
  continue;
435
435
  }
436
+ if (typeValue === "composite") {
437
+ const rawMembers = rawEvaluator.evaluators;
438
+ if (!Array.isArray(rawMembers)) {
439
+ logWarning2(
440
+ `Skipping composite evaluator '${name}' in '${evalId}': missing evaluators array`
441
+ );
442
+ continue;
443
+ }
444
+ const rawAggregator = rawEvaluator.aggregator;
445
+ if (!isJsonObject2(rawAggregator)) {
446
+ logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
447
+ continue;
448
+ }
449
+ const aggregatorType = asString2(rawAggregator.type);
450
+ if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
451
+ logWarning2(
452
+ `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
453
+ );
454
+ continue;
455
+ }
456
+ const memberEvaluators = [];
457
+ for (const rawMember of rawMembers) {
458
+ if (!isJsonObject2(rawMember)) {
459
+ logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
460
+ continue;
461
+ }
462
+ const memberName = asString2(rawMember.name);
463
+ const memberType = rawMember.type;
464
+ if (!memberName || !isEvaluatorKind(memberType)) {
465
+ logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
466
+ continue;
467
+ }
468
+ const memberConfigs = await parseEvaluators(
469
+ { evaluators: [rawMember] },
470
+ void 0,
471
+ searchRoots,
472
+ `${evalId}:${name}:${memberName}`
473
+ );
474
+ if (memberConfigs && memberConfigs.length > 0) {
475
+ memberEvaluators.push(memberConfigs[0]);
476
+ }
477
+ }
478
+ if (memberEvaluators.length === 0) {
479
+ logWarning2(
480
+ `Skipping composite evaluator '${name}' in '${evalId}': no valid member evaluators`
481
+ );
482
+ continue;
483
+ }
484
+ let aggregator;
485
+ if (aggregatorType === "weighted_average") {
486
+ const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
487
+ const parsedWeights = {};
488
+ if (weights) {
489
+ for (const [key, value] of Object.entries(weights)) {
490
+ if (typeof value === "number") {
491
+ parsedWeights[key] = value;
492
+ }
493
+ }
494
+ }
495
+ aggregator = {
496
+ type: "weighted_average",
497
+ ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
498
+ };
499
+ } else if (aggregatorType === "code_judge") {
500
+ const aggregatorPath = asString2(rawAggregator.path);
501
+ if (!aggregatorPath) {
502
+ logWarning2(
503
+ `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
504
+ );
505
+ continue;
506
+ }
507
+ aggregator = {
508
+ type: "code_judge",
509
+ path: aggregatorPath,
510
+ cwd: searchRoots[0]
511
+ };
512
+ } else {
513
+ const aggregatorPrompt = asString2(rawAggregator.prompt);
514
+ let promptPath2;
515
+ if (aggregatorPrompt) {
516
+ const resolved = await resolveFileReference2(aggregatorPrompt, searchRoots);
517
+ if (resolved.resolvedPath) {
518
+ promptPath2 = path3.resolve(resolved.resolvedPath);
519
+ }
520
+ }
521
+ aggregator = {
522
+ type: "llm_judge",
523
+ ...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
524
+ ...promptPath2 ? { promptPath: promptPath2 } : {}
525
+ };
526
+ }
527
+ evaluators.push({
528
+ name,
529
+ type: "composite",
530
+ evaluators: memberEvaluators,
531
+ aggregator
532
+ });
533
+ continue;
534
+ }
436
535
  const prompt = asString2(rawEvaluator.prompt);
437
536
  let promptPath;
438
537
  if (prompt) {
@@ -453,25 +552,25 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
453
552
  }
454
553
  }
455
554
  const _model = asString2(rawEvaluator.model);
555
+ const rawRubrics = rawEvaluator.rubrics;
556
+ const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
557
+ id: asString2(rubric.id) ?? `rubric-${index + 1}`,
558
+ description: asString2(rubric.description) ?? "",
559
+ weight: typeof rubric.weight === "number" ? rubric.weight : 1,
560
+ required: typeof rubric.required === "boolean" ? rubric.required : true
561
+ })).filter((r) => r.description.length > 0) : void 0;
456
562
  if (typeValue === "rubric") {
457
- const rubrics = rawEvaluator.rubrics;
458
- if (!Array.isArray(rubrics)) {
563
+ if (!parsedRubrics) {
459
564
  logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
460
565
  continue;
461
566
  }
462
- const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
463
- id: asString2(rubric.id) ?? `rubric-${index + 1}`,
464
- description: asString2(rubric.description) ?? "",
465
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
466
- required: typeof rubric.required === "boolean" ? rubric.required : true
467
- })).filter((r) => r.description.length > 0);
468
567
  if (parsedRubrics.length === 0) {
469
568
  logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
470
569
  continue;
471
570
  }
472
571
  evaluators.push({
473
572
  name,
474
- type: "rubric",
573
+ type: "llm_judge",
475
574
  rubrics: parsedRubrics
476
575
  });
477
576
  continue;
@@ -480,7 +579,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
480
579
  name,
481
580
  type: "llm_judge",
482
581
  prompt,
483
- promptPath
582
+ promptPath,
583
+ ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
484
584
  });
485
585
  }
486
586
  return evaluators.length > 0 ? evaluators : void 0;
@@ -1031,7 +1131,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1031
1131
  if (rubricItems.length > 0) {
1032
1132
  const rubricEvaluator = {
1033
1133
  name: "rubric",
1034
- type: "rubric",
1134
+ type: "llm_judge",
1035
1135
  rubrics: rubricItems
1036
1136
  };
1037
1137
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
@@ -2928,149 +3028,9 @@ function resolveAndCreateProvider(definition, env = process.env) {
2928
3028
  return createProvider(resolved);
2929
3029
  }
2930
3030
 
2931
- // src/evaluation/evaluators/rubric-evaluator.ts
3031
+ // src/evaluation/evaluators.ts
2932
3032
  import { generateText as generateText2 } from "ai";
2933
3033
  import { z } from "zod";
2934
- var rubricCheckResultSchema = z.object({
2935
- id: z.string().describe("The ID of the rubric item being checked"),
2936
- satisfied: z.boolean().describe("Whether this rubric requirement is met"),
2937
- reasoning: z.string().describe("Brief explanation (1-2 sentences) for this check")
2938
- });
2939
- var rubricEvaluationSchema = z.object({
2940
- checks: z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
2941
- overall_reasoning: z.string().describe("Overall assessment summary (1-2 sentences)")
2942
- });
2943
- var RubricEvaluator = class {
2944
- kind = "rubric";
2945
- config;
2946
- resolveJudgeProvider;
2947
- constructor(options) {
2948
- this.config = options.config;
2949
- this.resolveJudgeProvider = options.resolveJudgeProvider;
2950
- }
2951
- async evaluate(context) {
2952
- const judgeProvider = await this.resolveJudgeProvider(context);
2953
- if (!judgeProvider) {
2954
- throw new Error("No judge provider available for rubric evaluation");
2955
- }
2956
- if (!this.config.rubrics || this.config.rubrics.length === 0) {
2957
- throw new Error(
2958
- `No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
2959
- );
2960
- }
2961
- const prompt = this.buildPrompt(context, this.config.rubrics);
2962
- const model = judgeProvider.asLanguageModel?.();
2963
- if (!model) {
2964
- throw new Error("Judge provider does not support language model interface");
2965
- }
2966
- const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
2967
- You must return a valid JSON object matching this schema:
2968
- {
2969
- "checks": [
2970
- {
2971
- "id": "string (rubric id)",
2972
- "satisfied": boolean,
2973
- "reasoning": "string (brief explanation)"
2974
- }
2975
- ],
2976
- "overall_reasoning": "string (summary)"
2977
- }`;
2978
- let result;
2979
- let lastError;
2980
- for (let attempt = 1; attempt <= 3; attempt++) {
2981
- try {
2982
- const { text } = await generateText2({
2983
- model,
2984
- system,
2985
- prompt
2986
- });
2987
- const cleaned = text.replace(/```json\n?|```/g, "").trim();
2988
- result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
2989
- break;
2990
- } catch (e) {
2991
- lastError = e instanceof Error ? e : new Error(String(e));
2992
- }
2993
- }
2994
- if (!result) {
2995
- throw new Error(
2996
- `Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
2997
- );
2998
- }
2999
- const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
3000
- return {
3001
- score,
3002
- verdict,
3003
- hits,
3004
- misses,
3005
- expectedAspectCount: this.config.rubrics.length,
3006
- reasoning: result.overall_reasoning,
3007
- evaluatorRawRequest: {
3008
- prompt
3009
- }
3010
- };
3011
- }
3012
- buildPrompt(context, rubrics) {
3013
- const parts = [
3014
- "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
3015
- "",
3016
- "[[ ## question ## ]]",
3017
- context.evalCase.question,
3018
- "",
3019
- "[[ ## expected_outcome ## ]]",
3020
- context.evalCase.expected_outcome,
3021
- ""
3022
- ];
3023
- if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
3024
- parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
3025
- }
3026
- parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
3027
- for (const rubric of rubrics) {
3028
- const requiredLabel = rubric.required ? " (REQUIRED)" : "";
3029
- const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
3030
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
3031
- }
3032
- parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
3033
- return parts.join("\n");
3034
- }
3035
- calculateScore(result, rubrics) {
3036
- const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
3037
- const hits = [];
3038
- const misses = [];
3039
- let totalWeight = 0;
3040
- let earnedWeight = 0;
3041
- let failedRequired = false;
3042
- for (const check of result.checks) {
3043
- const rubric = rubricMap.get(check.id);
3044
- if (!rubric) {
3045
- continue;
3046
- }
3047
- totalWeight += rubric.weight;
3048
- if (check.satisfied) {
3049
- earnedWeight += rubric.weight;
3050
- hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
3051
- } else {
3052
- misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
3053
- if (rubric.required) {
3054
- failedRequired = true;
3055
- }
3056
- }
3057
- }
3058
- const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
3059
- let verdict;
3060
- if (failedRequired) {
3061
- verdict = "fail";
3062
- } else if (score >= 0.8) {
3063
- verdict = "pass";
3064
- } else if (score >= 0.6) {
3065
- verdict = "borderline";
3066
- } else {
3067
- verdict = "fail";
3068
- }
3069
- return { score, verdict, hits, misses };
3070
- }
3071
- };
3072
-
3073
- // src/evaluation/evaluators.ts
3074
3034
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
3075
3035
 
3076
3036
  Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -3088,6 +3048,21 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
3088
3048
 
3089
3049
  [[ ## candidate_answer ## ]]
3090
3050
  {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
3051
+ var freeformEvaluationSchema = z.object({
3052
+ score: z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
3053
+ hits: z.array(z.string()).describe("Brief specific achievements").optional(),
3054
+ misses: z.array(z.string()).describe("Brief failures or omissions").optional(),
3055
+ reasoning: z.string().describe("Concise explanation (1-2 sentences)").optional()
3056
+ });
3057
+ var rubricCheckResultSchema = z.object({
3058
+ id: z.string().describe("The ID of the rubric item being checked"),
3059
+ satisfied: z.boolean().describe("Whether this rubric requirement is met"),
3060
+ reasoning: z.string().describe("Brief explanation (1-2 sentences) for this check")
3061
+ });
3062
+ var rubricEvaluationSchema = z.object({
3063
+ checks: z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
3064
+ overall_reasoning: z.string().describe("Overall assessment summary (1-2 sentences)")
3065
+ });
3091
3066
  var LlmJudgeEvaluator = class {
3092
3067
  kind = "llm_judge";
3093
3068
  resolveJudgeProvider;
@@ -3105,9 +3080,13 @@ var LlmJudgeEvaluator = class {
3105
3080
  if (!judgeProvider) {
3106
3081
  throw new Error("No judge provider available for LLM grading");
3107
3082
  }
3108
- return this.evaluateWithPrompt(context, judgeProvider);
3083
+ const config = context.evaluator;
3084
+ if (config?.type === "llm_judge" && config.rubrics && config.rubrics.length > 0) {
3085
+ return this.evaluateWithRubrics(context, judgeProvider, config.rubrics);
3086
+ }
3087
+ return this.evaluateFreeform(context, judgeProvider);
3109
3088
  }
3110
- async evaluateWithPrompt(context, judgeProvider) {
3089
+ async evaluateFreeform(context, judgeProvider) {
3111
3090
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
3112
3091
  const variables = {
3113
3092
  [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
@@ -3124,34 +3103,132 @@ var LlmJudgeEvaluator = class {
3124
3103
  const systemPrompt = buildOutputSchema();
3125
3104
  const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
3126
3105
  const userPrompt = substituteVariables(evaluatorTemplate, variables);
3127
- const response = await judgeProvider.invoke({
3128
- question: userPrompt,
3129
- systemPrompt,
3130
- evalCaseId: context.evalCase.id,
3131
- attempt: context.attempt,
3132
- maxOutputTokens: this.maxOutputTokens,
3133
- temperature: this.temperature
3134
- });
3135
- const parsed = parseQualityResponse(response);
3136
- const score = clampScore(parsed.score ?? 0);
3137
- const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
3138
- const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
3139
- const reasoning = parsed.reasoning ?? response.reasoning;
3140
- const expectedAspectCount = Math.max(hits.length + misses.length, 1);
3141
3106
  const evaluatorRawRequest = {
3142
3107
  userPrompt,
3143
3108
  systemPrompt,
3144
3109
  target: judgeProvider.targetName
3145
3110
  };
3111
+ try {
3112
+ const { data, providerResponse } = await this.runWithRetry({
3113
+ context,
3114
+ judgeProvider,
3115
+ systemPrompt,
3116
+ userPrompt,
3117
+ schema: freeformEvaluationSchema
3118
+ });
3119
+ const score = clampScore(data.score);
3120
+ const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
3121
+ const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
3122
+ const reasoning = data.reasoning ?? providerResponse?.reasoning;
3123
+ const expectedAspectCount = Math.max(hits.length + misses.length, 1);
3124
+ return {
3125
+ score,
3126
+ verdict: scoreToVerdict(score),
3127
+ hits,
3128
+ misses,
3129
+ expectedAspectCount,
3130
+ reasoning,
3131
+ evaluatorRawRequest
3132
+ };
3133
+ } catch {
3134
+ return {
3135
+ score: 0,
3136
+ verdict: "fail",
3137
+ hits: [],
3138
+ misses: [],
3139
+ expectedAspectCount: 1,
3140
+ evaluatorRawRequest
3141
+ };
3142
+ }
3143
+ }
3144
+ async evaluateWithRubrics(context, judgeProvider, rubrics) {
3145
+ if (!rubrics || rubrics.length === 0) {
3146
+ throw new Error(
3147
+ `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
3148
+ );
3149
+ }
3150
+ const prompt = this.buildRubricPrompt(context, rubrics);
3151
+ const systemPrompt = buildRubricOutputSchema();
3152
+ const evaluatorRawRequest = {
3153
+ userPrompt: prompt,
3154
+ systemPrompt,
3155
+ target: judgeProvider.targetName
3156
+ };
3157
+ const { data } = await this.runWithRetry({
3158
+ context,
3159
+ judgeProvider,
3160
+ systemPrompt,
3161
+ userPrompt: prompt,
3162
+ schema: rubricEvaluationSchema
3163
+ });
3164
+ const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
3146
3165
  return {
3147
3166
  score,
3167
+ verdict,
3148
3168
  hits,
3149
3169
  misses,
3150
- expectedAspectCount,
3151
- reasoning,
3170
+ expectedAspectCount: rubrics.length,
3171
+ reasoning: data.overall_reasoning,
3152
3172
  evaluatorRawRequest
3153
3173
  };
3154
3174
  }
3175
+ buildRubricPrompt(context, rubrics) {
3176
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
3177
+ const parts = [
3178
+ "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
3179
+ "",
3180
+ "[[ ## question ## ]]",
3181
+ formattedQuestion,
3182
+ "",
3183
+ "[[ ## expected_outcome ## ]]",
3184
+ context.evalCase.expected_outcome,
3185
+ ""
3186
+ ];
3187
+ if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
3188
+ parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
3189
+ }
3190
+ parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
3191
+ for (const rubric of rubrics) {
3192
+ const requiredLabel = rubric.required ? " (REQUIRED)" : "";
3193
+ const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
3194
+ parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
3195
+ }
3196
+ parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
3197
+ return parts.join("\n");
3198
+ }
3199
+ async runWithRetry(options) {
3200
+ const { context, judgeProvider, systemPrompt, userPrompt, schema } = options;
3201
+ let lastError;
3202
+ for (let attempt = 1; attempt <= 3; attempt++) {
3203
+ try {
3204
+ const model = judgeProvider.asLanguageModel?.();
3205
+ if (model) {
3206
+ const { text } = await generateText2({
3207
+ model,
3208
+ system: systemPrompt,
3209
+ prompt: userPrompt,
3210
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
3211
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
3212
+ });
3213
+ const data2 = schema.parse(parseJsonFromText(text));
3214
+ return { data: data2 };
3215
+ }
3216
+ const response = await judgeProvider.invoke({
3217
+ question: userPrompt,
3218
+ systemPrompt,
3219
+ evalCaseId: context.evalCase.id,
3220
+ attempt: context.attempt,
3221
+ maxOutputTokens: this.maxOutputTokens,
3222
+ temperature: this.temperature
3223
+ });
3224
+ const data = schema.parse(parseJsonFromText(response.text ?? ""));
3225
+ return { data, providerResponse: response };
3226
+ } catch (e) {
3227
+ lastError = e instanceof Error ? e : new Error(String(e));
3228
+ }
3229
+ }
3230
+ throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
3231
+ }
3155
3232
  };
3156
3233
  function buildOutputSchema() {
3157
3234
  return [
@@ -3165,6 +3242,29 @@ function buildOutputSchema() {
3165
3242
  "}"
3166
3243
  ].join("\n");
3167
3244
  }
3245
+ function buildRubricOutputSchema() {
3246
+ return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
3247
+ You must return a valid JSON object matching this schema:
3248
+ {
3249
+ "checks": [
3250
+ {
3251
+ "id": "string (rubric id)",
3252
+ "satisfied": boolean,
3253
+ "reasoning": "string (brief explanation)"
3254
+ }
3255
+ ],
3256
+ "overall_reasoning": "string (summary)"
3257
+ }`;
3258
+ }
3259
+ function scoreToVerdict(score) {
3260
+ if (score >= 0.8) {
3261
+ return "pass";
3262
+ }
3263
+ if (score >= 0.6) {
3264
+ return "borderline";
3265
+ }
3266
+ return "fail";
3267
+ }
3168
3268
  function clampScore(value) {
3169
3269
  if (Number.isNaN(value) || !Number.isFinite(value)) {
3170
3270
  return 0;
@@ -3177,71 +3277,15 @@ function clampScore(value) {
3177
3277
  }
3178
3278
  return value;
3179
3279
  }
3180
- function parseQualityResponse(response) {
3181
- const text = typeof response.text === "string" ? response.text.trim() : "";
3182
- if (text.length === 0) {
3183
- return {};
3184
- }
3185
- const direct = attemptParseJson(text);
3186
- if (direct && validateQualityJson(direct)) {
3187
- return direct;
3188
- }
3189
- const extracted = extractJsonBlob(text);
3190
- if (extracted) {
3191
- const parsed = attemptParseJson(extracted);
3192
- if (parsed && validateQualityJson(parsed)) {
3193
- return parsed;
3194
- }
3195
- }
3196
- return {};
3197
- }
3198
- function attemptParseJson(text) {
3199
- try {
3200
- const parsed = JSON.parse(text);
3201
- const score = typeof parsed.score === "number" ? parsed.score : void 0;
3202
- const hits = parsed.hits;
3203
- const misses = parsed.misses;
3204
- const reasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : void 0;
3205
- return { score, hits, misses, reasoning };
3206
- } catch {
3207
- return void 0;
3208
- }
3209
- }
3210
- function validateQualityJson(parsed) {
3211
- if (typeof parsed.score !== "number") {
3212
- return false;
3213
- }
3214
- if (Number.isNaN(parsed.score) || !Number.isFinite(parsed.score)) {
3215
- return false;
3216
- }
3217
- if (parsed.score < 0 || parsed.score > 1) {
3218
- return false;
3219
- }
3220
- if (parsed.hits !== void 0) {
3221
- if (!Array.isArray(parsed.hits)) {
3222
- return false;
3223
- }
3224
- if (!parsed.hits.every((item) => typeof item === "string")) {
3225
- return false;
3226
- }
3227
- }
3228
- if (parsed.misses !== void 0) {
3229
- if (!Array.isArray(parsed.misses)) {
3230
- return false;
3231
- }
3232
- if (!parsed.misses.every((item) => typeof item === "string")) {
3233
- return false;
3234
- }
3235
- }
3236
- if (parsed.reasoning !== void 0 && typeof parsed.reasoning !== "string") {
3237
- return false;
3238
- }
3239
- return true;
3240
- }
3241
3280
  function extractJsonBlob(text) {
3242
3281
  const match = text.match(/\{[\s\S]*\}/);
3243
3282
  return match?.[0];
3244
3283
  }
3284
+ function parseJsonFromText(text) {
3285
+ const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
3286
+ const blob = extractJsonBlob(cleaned) ?? cleaned;
3287
+ return JSON.parse(blob);
3288
+ }
3245
3289
  function isNonEmptyString(value) {
3246
3290
  return typeof value === "string" && value.trim().length > 0;
3247
3291
  }
@@ -3278,6 +3322,7 @@ var CodeEvaluator = class {
3278
3322
  const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
3279
3323
  return {
3280
3324
  score,
3325
+ verdict: scoreToVerdict(score),
3281
3326
  hits,
3282
3327
  misses,
3283
3328
  expectedAspectCount: hits.length + misses.length || 1,
@@ -3291,6 +3336,7 @@ var CodeEvaluator = class {
3291
3336
  const message = error instanceof Error ? error.message : String(error);
3292
3337
  return {
3293
3338
  score: 0,
3339
+ verdict: "fail",
3294
3340
  hits: [],
3295
3341
  misses: [`Code evaluator failed: ${message}`],
3296
3342
  expectedAspectCount: 1,
@@ -3304,6 +3350,33 @@ var CodeEvaluator = class {
3304
3350
  }
3305
3351
  }
3306
3352
  };
3353
+ function calculateRubricScore(result, rubrics) {
3354
+ const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
3355
+ const hits = [];
3356
+ const misses = [];
3357
+ let totalWeight = 0;
3358
+ let earnedWeight = 0;
3359
+ let failedRequired = false;
3360
+ for (const check of result.checks) {
3361
+ const rubric = rubricMap.get(check.id);
3362
+ if (!rubric) {
3363
+ continue;
3364
+ }
3365
+ totalWeight += rubric.weight;
3366
+ if (check.satisfied) {
3367
+ earnedWeight += rubric.weight;
3368
+ hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
3369
+ } else {
3370
+ misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
3371
+ if (rubric.required) {
3372
+ failedRequired = true;
3373
+ }
3374
+ }
3375
+ }
3376
+ const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
3377
+ const verdict = failedRequired ? "fail" : scoreToVerdict(score);
3378
+ return { score, verdict, hits, misses };
3379
+ }
3307
3380
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
3308
3381
  const { spawn: spawn2 } = await import("node:child_process");
3309
3382
  return await new Promise((resolve, reject) => {
@@ -3355,6 +3428,228 @@ function substituteVariables(template, variables) {
3355
3428
  return variables[varName] ?? match;
3356
3429
  });
3357
3430
  }
3431
/**
 * Prompt used by the LLM aggregator when the composite config supplies none.
 * Every `{{EVALUATOR_RESULTS_JSON}}` placeholder is replaced with the
 * pretty-printed JSON of the member evaluator results.
 */
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
{{EVALUATOR_RESULTS_JSON}}

Decide the final score and verdict based on all evaluator results.
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;

/**
 * Evaluator that runs a list of member evaluators and combines their results
 * with one of three aggregators, selected by `config.aggregator.type`:
 *
 * - `"code_judge"`: pipes the member results to an external script;
 * - `"llm_judge"`: asks the judge provider to decide the final score;
 * - anything else (or no aggregator): weighted average of member scores.
 *
 * Relies on helpers defined elsewhere in this module (`clampScore`,
 * `scoreToVerdict`, `executeScript`, `parseJsonSafe`, `parseJsonFromText`,
 * `isNonEmptyString`, `buildOutputSchema`, `freeformEvaluationSchema`,
 * `generateText2`).
 */
var CompositeEvaluator = class {
  kind = "composite";
  config;
  evaluatorFactory;
  cwd;

  /**
   * @param {{config: object, evaluatorFactory: {create: Function}, cwd?: string}} options
   *   `config` carries `evaluators` (member configs) and optionally
   *   `aggregator`; `evaluatorFactory.create(memberConfig, context)` must
   *   return an object with an async `evaluate(context)`; `cwd` is the default
   *   working directory for a code aggregator.
   */
  constructor(options) {
    this.config = options.config;
    this.evaluatorFactory = options.evaluatorFactory;
    this.cwd = options.cwd;
  }

  /**
   * Run every member evaluator concurrently, then aggregate.
   * A member that rejects makes the whole call reject (Promise.all semantics).
   *
   * @param {object} context Evaluation context handed to each member unchanged.
   * @returns {Promise<object>} The aggregated evaluation result.
   */
  async evaluate(context) {
    const memberResults = await Promise.all(
      this.config.evaluators.map(async (memberConfig) => {
        const evaluator = this.evaluatorFactory.create(memberConfig, context);
        return {
          id: memberConfig.name,
          type: memberConfig.type,
          result: await evaluator.evaluate(context)
        };
      })
    );
    return this.aggregate(memberResults, context);
  }

  /**
   * Dispatch to the configured aggregator.
   *
   * @param {Array<{id: string, type: string, result: object}>} results Member results.
   * @param {object} context Evaluation context (only the LLM aggregator reads it).
   * @returns {Promise<object>|object} The aggregated evaluation result.
   */
  async aggregate(results, context) {
    // The default branch already treats unknown types as "weighted average";
    // optional chaining extends that to a config with no aggregator at all
    // instead of failing with a TypeError.
    const aggregator = this.config.aggregator;
    switch (aggregator?.type) {
      case "code_judge":
        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
      case "llm_judge":
        return this.runLlmAggregator(results, context, aggregator);
      default:
        return this.runWeightedAverage(results, aggregator?.weights);
    }
  }

  /**
   * Flatten one member result into the record stored under `evaluatorResults`.
   * The `weight` key is emitted only when a weight is supplied, so the LLM
   * aggregator (which has no weights) keeps producing records without it.
   */
  #summarizeMember(member, weight) {
    const { result } = member;
    return {
      name: member.id,
      type: member.type,
      score: result.score,
      ...(weight === undefined ? {} : { weight }),
      verdict: result.verdict,
      // Copies, so later mutation of the member result cannot leak in.
      hits: [...result.hits],
      misses: [...result.misses],
      reasoning: result.reasoning,
      evaluatorRawRequest: result.evaluatorRawRequest,
      evaluatorResults: result.evaluatorResults
    };
  }

  /**
   * Combine member scores as a weighted mean.
   *
   * @param {Array<{id: string, type: string, result: object}>} results Member results.
   * @param {Record<string, number>} [weights] Weight per member id; members
   *   without an entry weigh 1.
   * @returns {object} Aggregated result; hits/misses are prefixed with the
   *   member id and member reasonings are joined with "; ".
   */
  runWeightedAverage(results, weights) {
    let totalWeight = 0;
    let weightedSum = 0;
    const allHits = [];
    const allMisses = [];
    const reasoningParts = [];
    const evaluatorResults = [];
    for (const member of results) {
      const weight = weights?.[member.id] ?? 1;
      totalWeight += weight;
      weightedSum += member.result.score * weight;
      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
      if (member.result.reasoning) {
        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
      }
      evaluatorResults.push(this.#summarizeMember(member, weight));
    }
    // Clamp once and derive the verdict from the same value that is reported,
    // so score and verdict can never disagree.
    const finalScore = clampScore(totalWeight > 0 ? weightedSum / totalWeight : 0);
    return {
      score: finalScore,
      verdict: scoreToVerdict(finalScore),
      hits: allHits,
      misses: allMisses,
      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
      evaluatorRawRequest: {
        aggregator: "weighted_average",
        ...weights ? { weights } : {}
      },
      evaluatorResults
    };
  }

  /**
   * Let an external script decide the final result. The script receives
   * `{ "results": { <memberId>: <memberResult> } }` as JSON input and is
   * expected to print a JSON object with `score` and optionally `verdict`,
   * `hits`, `misses`, `reasoning`.
   *
   * @param {Array<{id: string, type: string, result: object}>} results Member results.
   * @param {string} scriptPath Script handed to `executeScript`.
   * @param {string} [cwd] Working directory for the script.
   * @param {Record<string, number>} [weights] Only recorded on the child
   *   records; it does not influence the score.
   * @returns {Promise<object>} Aggregated result; a script failure yields a
   *   score-0 "fail" result carrying the error message rather than a rejection.
   */
  async runCodeAggregator(results, scriptPath, cwd, weights) {
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
    const evaluatorResults = results.map(
      (member) => this.#summarizeMember(member, weights?.[member.id] ?? 1)
    );
    try {
      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
      const parsed = parseJsonSafe(stdout);
      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
      // Trust the script's verdict only when it is one of the known labels.
      const verdict = ["pass", "fail", "borderline"].includes(parsed?.verdict)
        ? parsed.verdict
        : scoreToVerdict(score);
      return {
        score,
        verdict,
        hits,
        misses,
        expectedAspectCount: hits.length + misses.length || 1,
        reasoning,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath
        },
        evaluatorResults
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`Code aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath,
          error: message
        },
        evaluatorResults
      };
    }
  }

  /**
   * Ask the judge provider to decide the final result from the member results.
   *
   * @param {Array<{id: string, type: string, result: object}>} results Member results.
   * @param {object} context Must carry `judgeProvider`; `evalCase.id` and
   *   `attempt` are forwarded when the provider is invoked directly.
   * @param {{prompt?: string}} config Aggregator config; `prompt` overrides
   *   the default template.
   * @returns {Promise<object>} Aggregated result. A provider or parsing
   *   failure yields a score-0 "fail" result that names the error.
   * @throws {Error} When `context.judgeProvider` is missing.
   */
  async runLlmAggregator(results, context, config) {
    const judgeProvider = context.judgeProvider;
    if (!judgeProvider) {
      throw new Error("No judge provider available for LLM aggregation");
    }
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const resultsJson = JSON.stringify(resultsObject, null, 2);
    const evaluatorResults = results.map((member) => this.#summarizeMember(member));
    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
    // Replacer function, not a replacement string: member hits/reasoning may
    // contain "$&", "$1", "$$" etc., which String.prototype.replace would
    // otherwise expand and thereby corrupt the prompt.
    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, () => resultsJson);
    const systemPrompt = buildOutputSchema();
    const evaluatorRawRequest = {
      aggregator: "llm_judge",
      userPrompt,
      systemPrompt,
      target: judgeProvider.targetName
    };
    try {
      // Prefer the provider's language-model handle when it exposes one;
      // otherwise fall back to invoking the provider directly.
      const model = judgeProvider.asLanguageModel?.();
      let rawText;
      let providerResponse;
      if (model) {
        const { text } = await generateText2({
          model,
          system: systemPrompt,
          prompt: userPrompt
        });
        rawText = text;
      } else {
        providerResponse = await judgeProvider.invoke({
          question: userPrompt,
          systemPrompt,
          evalCaseId: context.evalCase.id,
          attempt: context.attempt
        });
        rawText = providerResponse.text ?? "";
      }
      const data = freeformEvaluationSchema.parse(parseJsonFromText(rawText));
      const score = clampScore(data.score);
      // At most four hits and four misses are kept from the judge's answer.
      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
      // Only a direct provider response can supply fallback reasoning.
      const reasoning = providerResponse
        ? (data.reasoning ?? providerResponse.reasoning)
        : data.reasoning;
      return {
        score,
        // NOTE(review): the verdict is always derived from the score; a
        // verdict returned by the judge itself is not consulted here.
        verdict: scoreToVerdict(score),
        hits,
        misses,
        expectedAspectCount: Math.max(hits.length + misses.length, 1),
        reasoning,
        evaluatorRawRequest,
        evaluatorResults
      };
    } catch (error) {
      // Surface the cause instead of returning an unexplained zero score.
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`LLM aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest,
        evaluatorResults
      };
    }
  }
};
3358
3653
 
3359
3654
  // src/evaluation/orchestrator.ts
3360
3655
  import { createHash, randomUUID as randomUUID2 } from "node:crypto";
@@ -4040,7 +4335,6 @@ async function runEvaluatorList(options) {
4040
4335
  reasoning: score2.reasoning,
4041
4336
  evaluator_provider_request: score2.evaluatorRawRequest
4042
4337
  });
4043
- continue;
4044
4338
  }
4045
4339
  if (evaluator.type === "code") {
4046
4340
  const codeEvaluator = new CodeEvaluator({
@@ -4057,10 +4351,10 @@ async function runEvaluatorList(options) {
4057
4351
  promptInputs,
4058
4352
  now
4059
4353
  });
4060
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4354
+ scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
4061
4355
  evaluatorResults.push({
4062
4356
  name: evaluator.name,
4063
- type: evaluator.type,
4357
+ type: "code_judge",
4064
4358
  score: score2.score,
4065
4359
  verdict: score2.verdict,
4066
4360
  hits: score2.hits,
@@ -4068,19 +4362,37 @@ async function runEvaluatorList(options) {
4068
4362
  reasoning: score2.reasoning,
4069
4363
  evaluator_provider_request: score2.evaluatorRawRequest
4070
4364
  });
4071
- continue;
4072
4365
  }
4073
- if (evaluator.type === "rubric") {
4074
- const rubricEvaluator = new RubricEvaluator({
4075
- config: evaluator,
4076
- resolveJudgeProvider: async (context) => {
4077
- if (context.judgeProvider) {
4078
- return context.judgeProvider;
4366
+ if (evaluator.type === "composite") {
4367
+ const evalFileDir = evalCase.guideline_paths[0] ? path12.dirname(evalCase.guideline_paths[0]) : process.cwd();
4368
+ const createEvaluator = (memberConfig) => {
4369
+ switch (memberConfig.type) {
4370
+ case "llm_judge":
4371
+ return evaluatorRegistry.llm_judge;
4372
+ case "code":
4373
+ return new CodeEvaluator({
4374
+ script: memberConfig.script,
4375
+ cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
4376
+ agentTimeoutMs
4377
+ });
4378
+ case "composite":
4379
+ return new CompositeEvaluator({
4380
+ config: memberConfig,
4381
+ cwd: evalFileDir,
4382
+ evaluatorFactory: { create: createEvaluator }
4383
+ });
4384
+ default: {
4385
+ const unknownConfig = memberConfig;
4386
+ throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
4079
4387
  }
4080
- return judgeProvider;
4081
4388
  }
4389
+ };
4390
+ const compositeEvaluator = new CompositeEvaluator({
4391
+ config: evaluator,
4392
+ cwd: evalFileDir,
4393
+ evaluatorFactory: { create: createEvaluator }
4082
4394
  });
4083
- const score2 = await rubricEvaluator.evaluate({
4395
+ const score2 = await compositeEvaluator.evaluate({
4084
4396
  evalCase,
4085
4397
  candidate,
4086
4398
  target,
@@ -4099,27 +4411,31 @@ async function runEvaluatorList(options) {
4099
4411
  hits: score2.hits,
4100
4412
  misses: score2.misses,
4101
4413
  reasoning: score2.reasoning,
4102
- evaluator_provider_request: score2.evaluatorRawRequest
4414
+ evaluator_provider_request: score2.evaluatorRawRequest,
4415
+ evaluator_results: mapChildResults(score2.evaluatorResults)
4103
4416
  });
4104
4417
  }
4105
4418
  } catch (error) {
4106
4419
  const message = error instanceof Error ? error.message : String(error);
4107
4420
  const fallbackScore = {
4108
4421
  score: 0,
4422
+ verdict: "fail",
4109
4423
  hits: [],
4110
4424
  misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
4111
4425
  expectedAspectCount: 1,
4112
4426
  reasoning: message
4113
4427
  };
4428
+ const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
4114
4429
  scored.push({
4115
4430
  score: fallbackScore,
4116
4431
  name: evaluator.name ?? "unknown",
4117
- type: evaluator.type ?? "unknown"
4432
+ type: resultType ?? "llm_judge"
4118
4433
  });
4119
4434
  evaluatorResults.push({
4120
4435
  name: evaluator.name ?? "unknown",
4121
- type: evaluator.type ?? "unknown",
4436
+ type: resultType ?? "llm_judge",
4122
4437
  score: 0,
4438
+ verdict: "fail",
4123
4439
  hits: [],
4124
4440
  misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
4125
4441
  reasoning: message
@@ -4138,6 +4454,7 @@ async function runEvaluatorList(options) {
4138
4454
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
4139
4455
  const score = {
4140
4456
  score: aggregateScore,
4457
+ verdict: scoreToVerdict2(aggregateScore),
4141
4458
  hits,
4142
4459
  misses,
4143
4460
  expectedAspectCount,
@@ -4188,6 +4505,15 @@ async function resolveCustomPrompt(config) {
4188
4505
  function isNonEmptyString2(value) {
4189
4506
  return typeof value === "string" && value.trim().length > 0;
4190
4507
  }
4508
/**
 * Translate a numeric score into a verdict label.
 *
 * @param {number} score Score to classify.
 * @returns {"pass"|"borderline"|"fail"} "pass" at 0.8 and above, "borderline"
 *   from 0.6 up to (but excluding) 0.8, "fail" for everything else, including
 *   values that compare false against both thresholds such as NaN.
 */
function scoreToVerdict2(score) {
  // Ordered from the highest threshold down; the first one met wins.
  const tiers = [
    [0.8, "pass"],
    [0.6, "borderline"]
  ];
  for (const [minimum, label] of tiers) {
    if (score >= minimum) {
      return label;
    }
  }
  return "fail";
}
4191
4517
  function filterEvalCases(evalCases, evalId) {
4192
4518
  if (!evalId) {
4193
4519
  return evalCases;
@@ -4325,6 +4651,23 @@ function isTimeoutLike(error) {
4325
4651
  const value = String(error).toLowerCase();
4326
4652
  return value.includes("timeout");
4327
4653
  }
4654
/**
 * Convert nested child evaluator results from their in-memory camelCase shape
 * into the snake_case shape used in reported results, recursing through
 * `evaluatorResults` at every level.
 *
 * @param {Array<object>|undefined|null} children Child results, if any.
 * @returns {Array<object>|undefined} The converted children, or `undefined`
 *   when there are none.
 */
function mapChildResults(children) {
  if (!children) {
    return void 0;
  }
  if (children.length === 0) {
    return void 0;
  }
  const toRecord = ({
    name,
    type,
    score,
    weight,
    verdict,
    hits,
    misses,
    reasoning,
    evaluatorRawRequest,
    evaluatorResults
  }) => ({
    name,
    type,
    score,
    weight,
    verdict,
    hits,
    misses,
    reasoning,
    evaluator_provider_request: evaluatorRawRequest,
    evaluator_results: mapChildResults(evaluatorResults)
  });
  return children.map((child) => toRecord(child));
}
4328
4671
 
4329
4672
  // src/evaluation/generators/rubric-generator.ts
4330
4673
  import { generateText as generateText3 } from "ai";
@@ -4412,8 +4755,8 @@ function createAgentKernel() {
4412
4755
  }
4413
4756
  export {
4414
4757
  CodeEvaluator,
4758
+ CompositeEvaluator,
4415
4759
  LlmJudgeEvaluator,
4416
- RubricEvaluator,
4417
4760
  TEST_MESSAGE_ROLES,
4418
4761
  buildDirectoryChain,
4419
4762
  buildPromptInputs,