@agentv/core 0.22.1 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -31,8 +31,8 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
31
31
  var index_exports = {};
32
32
  __export(index_exports, {
33
33
  CodeEvaluator: () => CodeEvaluator,
34
+ CompositeEvaluator: () => CompositeEvaluator,
34
35
  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
35
- RubricEvaluator: () => RubricEvaluator,
36
36
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
37
37
  buildDirectoryChain: () => buildDirectoryChain2,
38
38
  buildPromptInputs: () => buildPromptInputs,
@@ -108,7 +108,7 @@ function isTestMessage(value) {
108
108
  }
109
109
  return candidate.content.every(isJsonObject);
110
110
  }
111
- var EVALUATOR_KIND_VALUES = ["code", "llm_judge", "rubric"];
111
+ var EVALUATOR_KIND_VALUES = ["code_judge", "llm_judge", "rubric", "composite"];
112
112
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
113
113
  function isEvaluatorKind(value) {
114
114
  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
@@ -460,10 +460,10 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
460
460
  logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
461
461
  continue;
462
462
  }
463
- if (typeValue === "code") {
463
+ if (typeValue === "code_judge") {
464
464
  const script = asString2(rawEvaluator.script);
465
465
  if (!script) {
466
- logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
466
+ logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
467
467
  continue;
468
468
  }
469
469
  const cwd = asString2(rawEvaluator.cwd);
@@ -474,7 +474,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
474
474
  resolvedCwd = import_node_path3.default.resolve(resolved.resolvedPath);
475
475
  } else {
476
476
  logWarning2(
477
- `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
477
+ `Code_judge evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
478
478
  resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
479
479
  );
480
480
  }
@@ -490,6 +490,105 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
490
490
  });
491
491
  continue;
492
492
  }
493
+ if (typeValue === "composite") {
494
+ const rawMembers = rawEvaluator.evaluators;
495
+ if (!Array.isArray(rawMembers)) {
496
+ logWarning2(
497
+ `Skipping composite evaluator '${name}' in '${evalId}': missing evaluators array`
498
+ );
499
+ continue;
500
+ }
501
+ const rawAggregator = rawEvaluator.aggregator;
502
+ if (!isJsonObject2(rawAggregator)) {
503
+ logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
504
+ continue;
505
+ }
506
+ const aggregatorType = asString2(rawAggregator.type);
507
+ if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
508
+ logWarning2(
509
+ `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
510
+ );
511
+ continue;
512
+ }
513
+ const memberEvaluators = [];
514
+ for (const rawMember of rawMembers) {
515
+ if (!isJsonObject2(rawMember)) {
516
+ logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
517
+ continue;
518
+ }
519
+ const memberName = asString2(rawMember.name);
520
+ const memberType = rawMember.type;
521
+ if (!memberName || !isEvaluatorKind(memberType)) {
522
+ logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
523
+ continue;
524
+ }
525
+ const memberConfigs = await parseEvaluators(
526
+ { evaluators: [rawMember] },
527
+ void 0,
528
+ searchRoots,
529
+ `${evalId}:${name}:${memberName}`
530
+ );
531
+ if (memberConfigs && memberConfigs.length > 0) {
532
+ memberEvaluators.push(memberConfigs[0]);
533
+ }
534
+ }
535
+ if (memberEvaluators.length === 0) {
536
+ logWarning2(
537
+ `Skipping composite evaluator '${name}' in '${evalId}': no valid member evaluators`
538
+ );
539
+ continue;
540
+ }
541
+ let aggregator;
542
+ if (aggregatorType === "weighted_average") {
543
+ const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
544
+ const parsedWeights = {};
545
+ if (weights) {
546
+ for (const [key, value] of Object.entries(weights)) {
547
+ if (typeof value === "number") {
548
+ parsedWeights[key] = value;
549
+ }
550
+ }
551
+ }
552
+ aggregator = {
553
+ type: "weighted_average",
554
+ ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
555
+ };
556
+ } else if (aggregatorType === "code_judge") {
557
+ const aggregatorPath = asString2(rawAggregator.path);
558
+ if (!aggregatorPath) {
559
+ logWarning2(
560
+ `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
561
+ );
562
+ continue;
563
+ }
564
+ aggregator = {
565
+ type: "code_judge",
566
+ path: aggregatorPath,
567
+ cwd: searchRoots[0]
568
+ };
569
+ } else {
570
+ const aggregatorPrompt = asString2(rawAggregator.prompt);
571
+ let promptPath2;
572
+ if (aggregatorPrompt) {
573
+ const resolved = await resolveFileReference(aggregatorPrompt, searchRoots);
574
+ if (resolved.resolvedPath) {
575
+ promptPath2 = import_node_path3.default.resolve(resolved.resolvedPath);
576
+ }
577
+ }
578
+ aggregator = {
579
+ type: "llm_judge",
580
+ ...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
581
+ ...promptPath2 ? { promptPath: promptPath2 } : {}
582
+ };
583
+ }
584
+ evaluators.push({
585
+ name,
586
+ type: "composite",
587
+ evaluators: memberEvaluators,
588
+ aggregator
589
+ });
590
+ continue;
591
+ }
493
592
  const prompt = asString2(rawEvaluator.prompt);
494
593
  let promptPath;
495
594
  if (prompt) {
@@ -510,25 +609,25 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
510
609
  }
511
610
  }
512
611
  const _model = asString2(rawEvaluator.model);
612
+ const rawRubrics = rawEvaluator.rubrics;
613
+ const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
614
+ id: asString2(rubric.id) ?? `rubric-${index + 1}`,
615
+ description: asString2(rubric.description) ?? "",
616
+ weight: typeof rubric.weight === "number" ? rubric.weight : 1,
617
+ required: typeof rubric.required === "boolean" ? rubric.required : true
618
+ })).filter((r) => r.description.length > 0) : void 0;
513
619
  if (typeValue === "rubric") {
514
- const rubrics = rawEvaluator.rubrics;
515
- if (!Array.isArray(rubrics)) {
620
+ if (!parsedRubrics) {
516
621
  logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
517
622
  continue;
518
623
  }
519
- const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
520
- id: asString2(rubric.id) ?? `rubric-${index + 1}`,
521
- description: asString2(rubric.description) ?? "",
522
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
523
- required: typeof rubric.required === "boolean" ? rubric.required : true
524
- })).filter((r) => r.description.length > 0);
525
624
  if (parsedRubrics.length === 0) {
526
625
  logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
527
626
  continue;
528
627
  }
529
628
  evaluators.push({
530
629
  name,
531
- type: "rubric",
630
+ type: "llm_judge",
532
631
  rubrics: parsedRubrics
533
632
  });
534
633
  continue;
@@ -537,7 +636,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
537
636
  name,
538
637
  type: "llm_judge",
539
638
  prompt,
540
- promptPath
639
+ promptPath,
640
+ ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
541
641
  });
542
642
  }
543
643
  return evaluators.length > 0 ? evaluators : void 0;
@@ -1088,7 +1188,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1088
1188
  if (rubricItems.length > 0) {
1089
1189
  const rubricEvaluator = {
1090
1190
  name: "rubric",
1091
- type: "rubric",
1191
+ type: "llm_judge",
1092
1192
  rubrics: rubricItems
1093
1193
  };
1094
1194
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
@@ -3621,149 +3721,9 @@ function resolveAndCreateProvider(definition, env = process.env) {
3621
3721
  return createProvider(resolved);
3622
3722
  }
3623
3723
 
3624
- // src/evaluation/evaluators/rubric-evaluator.ts
3724
+ // src/evaluation/evaluators.ts
3625
3725
  var import_ai2 = require("ai");
3626
3726
  var import_zod2 = require("zod");
3627
- var rubricCheckResultSchema = import_zod2.z.object({
3628
- id: import_zod2.z.string().describe("The ID of the rubric item being checked"),
3629
- satisfied: import_zod2.z.boolean().describe("Whether this rubric requirement is met"),
3630
- reasoning: import_zod2.z.string().describe("Brief explanation (1-2 sentences) for this check")
3631
- });
3632
- var rubricEvaluationSchema = import_zod2.z.object({
3633
- checks: import_zod2.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
3634
- overall_reasoning: import_zod2.z.string().describe("Overall assessment summary (1-2 sentences)")
3635
- });
3636
- var RubricEvaluator = class {
3637
- kind = "rubric";
3638
- config;
3639
- resolveJudgeProvider;
3640
- constructor(options) {
3641
- this.config = options.config;
3642
- this.resolveJudgeProvider = options.resolveJudgeProvider;
3643
- }
3644
- async evaluate(context) {
3645
- const judgeProvider = await this.resolveJudgeProvider(context);
3646
- if (!judgeProvider) {
3647
- throw new Error("No judge provider available for rubric evaluation");
3648
- }
3649
- if (!this.config.rubrics || this.config.rubrics.length === 0) {
3650
- throw new Error(
3651
- `No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
3652
- );
3653
- }
3654
- const prompt = this.buildPrompt(context, this.config.rubrics);
3655
- const model = judgeProvider.asLanguageModel?.();
3656
- if (!model) {
3657
- throw new Error("Judge provider does not support language model interface");
3658
- }
3659
- const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
3660
- You must return a valid JSON object matching this schema:
3661
- {
3662
- "checks": [
3663
- {
3664
- "id": "string (rubric id)",
3665
- "satisfied": boolean,
3666
- "reasoning": "string (brief explanation)"
3667
- }
3668
- ],
3669
- "overall_reasoning": "string (summary)"
3670
- }`;
3671
- let result;
3672
- let lastError;
3673
- for (let attempt = 1; attempt <= 3; attempt++) {
3674
- try {
3675
- const { text } = await (0, import_ai2.generateText)({
3676
- model,
3677
- system,
3678
- prompt
3679
- });
3680
- const cleaned = text.replace(/```json\n?|```/g, "").trim();
3681
- result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
3682
- break;
3683
- } catch (e) {
3684
- lastError = e instanceof Error ? e : new Error(String(e));
3685
- }
3686
- }
3687
- if (!result) {
3688
- throw new Error(
3689
- `Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
3690
- );
3691
- }
3692
- const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
3693
- return {
3694
- score,
3695
- verdict,
3696
- hits,
3697
- misses,
3698
- expectedAspectCount: this.config.rubrics.length,
3699
- reasoning: result.overall_reasoning,
3700
- evaluatorRawRequest: {
3701
- prompt
3702
- }
3703
- };
3704
- }
3705
- buildPrompt(context, rubrics) {
3706
- const parts = [
3707
- "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
3708
- "",
3709
- "[[ ## question ## ]]",
3710
- context.evalCase.question,
3711
- "",
3712
- "[[ ## expected_outcome ## ]]",
3713
- context.evalCase.expected_outcome,
3714
- ""
3715
- ];
3716
- if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
3717
- parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
3718
- }
3719
- parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
3720
- for (const rubric of rubrics) {
3721
- const requiredLabel = rubric.required ? " (REQUIRED)" : "";
3722
- const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
3723
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
3724
- }
3725
- parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
3726
- return parts.join("\n");
3727
- }
3728
- calculateScore(result, rubrics) {
3729
- const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
3730
- const hits = [];
3731
- const misses = [];
3732
- let totalWeight = 0;
3733
- let earnedWeight = 0;
3734
- let failedRequired = false;
3735
- for (const check of result.checks) {
3736
- const rubric = rubricMap.get(check.id);
3737
- if (!rubric) {
3738
- continue;
3739
- }
3740
- totalWeight += rubric.weight;
3741
- if (check.satisfied) {
3742
- earnedWeight += rubric.weight;
3743
- hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
3744
- } else {
3745
- misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
3746
- if (rubric.required) {
3747
- failedRequired = true;
3748
- }
3749
- }
3750
- }
3751
- const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
3752
- let verdict;
3753
- if (failedRequired) {
3754
- verdict = "fail";
3755
- } else if (score >= 0.8) {
3756
- verdict = "pass";
3757
- } else if (score >= 0.6) {
3758
- verdict = "borderline";
3759
- } else {
3760
- verdict = "fail";
3761
- }
3762
- return { score, verdict, hits, misses };
3763
- }
3764
- };
3765
-
3766
- // src/evaluation/evaluators.ts
3767
3727
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
3768
3728
 
3769
3729
  Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -3781,6 +3741,21 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
3781
3741
 
3782
3742
  [[ ## candidate_answer ## ]]
3783
3743
  {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
3744
+ var freeformEvaluationSchema = import_zod2.z.object({
3745
+ score: import_zod2.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
3746
+ hits: import_zod2.z.array(import_zod2.z.string()).describe("Brief specific achievements").optional(),
3747
+ misses: import_zod2.z.array(import_zod2.z.string()).describe("Brief failures or omissions").optional(),
3748
+ reasoning: import_zod2.z.string().describe("Concise explanation (1-2 sentences)").optional()
3749
+ });
3750
+ var rubricCheckResultSchema = import_zod2.z.object({
3751
+ id: import_zod2.z.string().describe("The ID of the rubric item being checked"),
3752
+ satisfied: import_zod2.z.boolean().describe("Whether this rubric requirement is met"),
3753
+ reasoning: import_zod2.z.string().describe("Brief explanation (1-2 sentences) for this check")
3754
+ });
3755
+ var rubricEvaluationSchema = import_zod2.z.object({
3756
+ checks: import_zod2.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
3757
+ overall_reasoning: import_zod2.z.string().describe("Overall assessment summary (1-2 sentences)")
3758
+ });
3784
3759
  var LlmJudgeEvaluator = class {
3785
3760
  kind = "llm_judge";
3786
3761
  resolveJudgeProvider;
@@ -3798,9 +3773,13 @@ var LlmJudgeEvaluator = class {
3798
3773
  if (!judgeProvider) {
3799
3774
  throw new Error("No judge provider available for LLM grading");
3800
3775
  }
3801
- return this.evaluateWithPrompt(context, judgeProvider);
3776
+ const config = context.evaluator;
3777
+ if (config?.type === "llm_judge" && config.rubrics && config.rubrics.length > 0) {
3778
+ return this.evaluateWithRubrics(context, judgeProvider, config.rubrics);
3779
+ }
3780
+ return this.evaluateFreeform(context, judgeProvider);
3802
3781
  }
3803
- async evaluateWithPrompt(context, judgeProvider) {
3782
+ async evaluateFreeform(context, judgeProvider) {
3804
3783
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
3805
3784
  const variables = {
3806
3785
  [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
@@ -3817,34 +3796,132 @@ var LlmJudgeEvaluator = class {
3817
3796
  const systemPrompt = buildOutputSchema();
3818
3797
  const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
3819
3798
  const userPrompt = substituteVariables(evaluatorTemplate, variables);
3820
- const response = await judgeProvider.invoke({
3821
- question: userPrompt,
3822
- systemPrompt,
3823
- evalCaseId: context.evalCase.id,
3824
- attempt: context.attempt,
3825
- maxOutputTokens: this.maxOutputTokens,
3826
- temperature: this.temperature
3827
- });
3828
- const parsed = parseQualityResponse(response);
3829
- const score = clampScore(parsed.score ?? 0);
3830
- const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
3831
- const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
3832
- const reasoning = parsed.reasoning ?? response.reasoning;
3833
- const expectedAspectCount = Math.max(hits.length + misses.length, 1);
3834
3799
  const evaluatorRawRequest = {
3835
3800
  userPrompt,
3836
3801
  systemPrompt,
3837
3802
  target: judgeProvider.targetName
3838
3803
  };
3804
+ try {
3805
+ const { data, providerResponse } = await this.runWithRetry({
3806
+ context,
3807
+ judgeProvider,
3808
+ systemPrompt,
3809
+ userPrompt,
3810
+ schema: freeformEvaluationSchema
3811
+ });
3812
+ const score = clampScore(data.score);
3813
+ const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
3814
+ const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
3815
+ const reasoning = data.reasoning ?? providerResponse?.reasoning;
3816
+ const expectedAspectCount = Math.max(hits.length + misses.length, 1);
3817
+ return {
3818
+ score,
3819
+ verdict: scoreToVerdict(score),
3820
+ hits,
3821
+ misses,
3822
+ expectedAspectCount,
3823
+ reasoning,
3824
+ evaluatorRawRequest
3825
+ };
3826
+ } catch {
3827
+ return {
3828
+ score: 0,
3829
+ verdict: "fail",
3830
+ hits: [],
3831
+ misses: [],
3832
+ expectedAspectCount: 1,
3833
+ evaluatorRawRequest
3834
+ };
3835
+ }
3836
+ }
3837
+ async evaluateWithRubrics(context, judgeProvider, rubrics) {
3838
+ if (!rubrics || rubrics.length === 0) {
3839
+ throw new Error(
3840
+ `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
3841
+ );
3842
+ }
3843
+ const prompt = this.buildRubricPrompt(context, rubrics);
3844
+ const systemPrompt = buildRubricOutputSchema();
3845
+ const evaluatorRawRequest = {
3846
+ userPrompt: prompt,
3847
+ systemPrompt,
3848
+ target: judgeProvider.targetName
3849
+ };
3850
+ const { data } = await this.runWithRetry({
3851
+ context,
3852
+ judgeProvider,
3853
+ systemPrompt,
3854
+ userPrompt: prompt,
3855
+ schema: rubricEvaluationSchema
3856
+ });
3857
+ const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
3839
3858
  return {
3840
3859
  score,
3860
+ verdict,
3841
3861
  hits,
3842
3862
  misses,
3843
- expectedAspectCount,
3844
- reasoning,
3863
+ expectedAspectCount: rubrics.length,
3864
+ reasoning: data.overall_reasoning,
3845
3865
  evaluatorRawRequest
3846
3866
  };
3847
3867
  }
3868
+ buildRubricPrompt(context, rubrics) {
3869
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
3870
+ const parts = [
3871
+ "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
3872
+ "",
3873
+ "[[ ## question ## ]]",
3874
+ formattedQuestion,
3875
+ "",
3876
+ "[[ ## expected_outcome ## ]]",
3877
+ context.evalCase.expected_outcome,
3878
+ ""
3879
+ ];
3880
+ if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
3881
+ parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
3882
+ }
3883
+ parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
3884
+ for (const rubric of rubrics) {
3885
+ const requiredLabel = rubric.required ? " (REQUIRED)" : "";
3886
+ const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
3887
+ parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
3888
+ }
3889
+ parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
3890
+ return parts.join("\n");
3891
+ }
3892
+ async runWithRetry(options) {
3893
+ const { context, judgeProvider, systemPrompt, userPrompt, schema } = options;
3894
+ let lastError;
3895
+ for (let attempt = 1; attempt <= 3; attempt++) {
3896
+ try {
3897
+ const model = judgeProvider.asLanguageModel?.();
3898
+ if (model) {
3899
+ const { text } = await (0, import_ai2.generateText)({
3900
+ model,
3901
+ system: systemPrompt,
3902
+ prompt: userPrompt,
3903
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
3904
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
3905
+ });
3906
+ const data2 = schema.parse(parseJsonFromText(text));
3907
+ return { data: data2 };
3908
+ }
3909
+ const response = await judgeProvider.invoke({
3910
+ question: userPrompt,
3911
+ systemPrompt,
3912
+ evalCaseId: context.evalCase.id,
3913
+ attempt: context.attempt,
3914
+ maxOutputTokens: this.maxOutputTokens,
3915
+ temperature: this.temperature
3916
+ });
3917
+ const data = schema.parse(parseJsonFromText(response.text ?? ""));
3918
+ return { data, providerResponse: response };
3919
+ } catch (e) {
3920
+ lastError = e instanceof Error ? e : new Error(String(e));
3921
+ }
3922
+ }
3923
+ throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
3924
+ }
3848
3925
  };
3849
3926
  function buildOutputSchema() {
3850
3927
  return [
@@ -3858,6 +3935,29 @@ function buildOutputSchema() {
3858
3935
  "}"
3859
3936
  ].join("\n");
3860
3937
  }
3938
+ function buildRubricOutputSchema() {
3939
+ return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
3940
+ You must return a valid JSON object matching this schema:
3941
+ {
3942
+ "checks": [
3943
+ {
3944
+ "id": "string (rubric id)",
3945
+ "satisfied": boolean,
3946
+ "reasoning": "string (brief explanation)"
3947
+ }
3948
+ ],
3949
+ "overall_reasoning": "string (summary)"
3950
+ }`;
3951
+ }
3952
+ function scoreToVerdict(score) {
3953
+ if (score >= 0.8) {
3954
+ return "pass";
3955
+ }
3956
+ if (score >= 0.6) {
3957
+ return "borderline";
3958
+ }
3959
+ return "fail";
3960
+ }
3861
3961
  function clampScore(value) {
3862
3962
  if (Number.isNaN(value) || !Number.isFinite(value)) {
3863
3963
  return 0;
@@ -3870,71 +3970,15 @@ function clampScore(value) {
3870
3970
  }
3871
3971
  return value;
3872
3972
  }
3873
- function parseQualityResponse(response) {
3874
- const text = typeof response.text === "string" ? response.text.trim() : "";
3875
- if (text.length === 0) {
3876
- return {};
3877
- }
3878
- const direct = attemptParseJson(text);
3879
- if (direct && validateQualityJson(direct)) {
3880
- return direct;
3881
- }
3882
- const extracted = extractJsonBlob(text);
3883
- if (extracted) {
3884
- const parsed = attemptParseJson(extracted);
3885
- if (parsed && validateQualityJson(parsed)) {
3886
- return parsed;
3887
- }
3888
- }
3889
- return {};
3890
- }
3891
- function attemptParseJson(text) {
3892
- try {
3893
- const parsed = JSON.parse(text);
3894
- const score = typeof parsed.score === "number" ? parsed.score : void 0;
3895
- const hits = parsed.hits;
3896
- const misses = parsed.misses;
3897
- const reasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : void 0;
3898
- return { score, hits, misses, reasoning };
3899
- } catch {
3900
- return void 0;
3901
- }
3902
- }
3903
- function validateQualityJson(parsed) {
3904
- if (typeof parsed.score !== "number") {
3905
- return false;
3906
- }
3907
- if (Number.isNaN(parsed.score) || !Number.isFinite(parsed.score)) {
3908
- return false;
3909
- }
3910
- if (parsed.score < 0 || parsed.score > 1) {
3911
- return false;
3912
- }
3913
- if (parsed.hits !== void 0) {
3914
- if (!Array.isArray(parsed.hits)) {
3915
- return false;
3916
- }
3917
- if (!parsed.hits.every((item) => typeof item === "string")) {
3918
- return false;
3919
- }
3920
- }
3921
- if (parsed.misses !== void 0) {
3922
- if (!Array.isArray(parsed.misses)) {
3923
- return false;
3924
- }
3925
- if (!parsed.misses.every((item) => typeof item === "string")) {
3926
- return false;
3927
- }
3928
- }
3929
- if (parsed.reasoning !== void 0 && typeof parsed.reasoning !== "string") {
3930
- return false;
3931
- }
3932
- return true;
3933
- }
3934
3973
  function extractJsonBlob(text) {
3935
3974
  const match = text.match(/\{[\s\S]*\}/);
3936
3975
  return match?.[0];
3937
3976
  }
3977
+ function parseJsonFromText(text) {
3978
+ const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
3979
+ const blob = extractJsonBlob(cleaned) ?? cleaned;
3980
+ return JSON.parse(blob);
3981
+ }
3938
3982
  function isNonEmptyString(value) {
3939
3983
  return typeof value === "string" && value.trim().length > 0;
3940
3984
  }
@@ -3971,6 +4015,7 @@ var CodeEvaluator = class {
3971
4015
  const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
3972
4016
  return {
3973
4017
  score,
4018
+ verdict: scoreToVerdict(score),
3974
4019
  hits,
3975
4020
  misses,
3976
4021
  expectedAspectCount: hits.length + misses.length || 1,
@@ -3984,6 +4029,7 @@ var CodeEvaluator = class {
3984
4029
  const message = error instanceof Error ? error.message : String(error);
3985
4030
  return {
3986
4031
  score: 0,
4032
+ verdict: "fail",
3987
4033
  hits: [],
3988
4034
  misses: [`Code evaluator failed: ${message}`],
3989
4035
  expectedAspectCount: 1,
@@ -3997,6 +4043,33 @@ var CodeEvaluator = class {
3997
4043
  }
3998
4044
  }
3999
4045
  };
4046
+ function calculateRubricScore(result, rubrics) {
4047
+ const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
4048
+ const hits = [];
4049
+ const misses = [];
4050
+ let totalWeight = 0;
4051
+ let earnedWeight = 0;
4052
+ let failedRequired = false;
4053
+ for (const check of result.checks) {
4054
+ const rubric = rubricMap.get(check.id);
4055
+ if (!rubric) {
4056
+ continue;
4057
+ }
4058
+ totalWeight += rubric.weight;
4059
+ if (check.satisfied) {
4060
+ earnedWeight += rubric.weight;
4061
+ hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
4062
+ } else {
4063
+ misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
4064
+ if (rubric.required) {
4065
+ failedRequired = true;
4066
+ }
4067
+ }
4068
+ }
4069
+ const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
4070
+ const verdict = failedRequired ? "fail" : scoreToVerdict(score);
4071
+ return { score, verdict, hits, misses };
4072
+ }
4000
4073
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
4001
4074
  const { spawn: spawn2 } = await import("child_process");
4002
4075
  return await new Promise((resolve, reject) => {
@@ -4048,6 +4121,228 @@ function substituteVariables(template, variables) {
4048
4121
  return variables[varName] ?? match;
4049
4122
  });
4050
4123
  }
4124
// Fallback user prompt for the composite evaluator's "llm_judge" aggregator,
// used when the aggregator config supplies no custom prompt. The
// {{EVALUATOR_RESULTS_JSON}} placeholder is replaced with the serialized
// member-evaluator results before the prompt is sent (see runLlmAggregator).
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
{{EVALUATOR_RESULTS_JSON}}

Decide the final score and verdict based on all evaluator results.
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
4129
// Evaluator that runs a list of member evaluators in parallel and combines
// their results with a configurable aggregation strategy: an external
// "code_judge" script, an "llm_judge" prompt, or (default) a weighted average.
var CompositeEvaluator = class {
  kind = "composite";
  // Composite config: { evaluators: [...], aggregator: { type, ... } }.
  config;
  // Factory with a create(memberConfig, context) method producing child evaluators.
  evaluatorFactory;
  // Working directory used for code aggregator scripts when the aggregator
  // config does not supply its own cwd.
  cwd;
  constructor(options) {
    this.config = options.config;
    this.evaluatorFactory = options.evaluatorFactory;
    this.cwd = options.cwd;
  }
  // Run every member evaluator concurrently, then aggregate their results.
  // Each member is tagged with its config name/type so aggregation and
  // reporting can attribute hits/misses to the right evaluator.
  async evaluate(context) {
    const memberResults = await Promise.all(
      this.config.evaluators.map(async (memberConfig) => {
        const evaluator = this.evaluatorFactory.create(memberConfig, context);
        return {
          id: memberConfig.name,
          type: memberConfig.type,
          result: await evaluator.evaluate(context)
        };
      })
    );
    return this.aggregate(memberResults, context);
  }
  // Dispatch on the configured aggregator type; anything other than
  // "code_judge" or "llm_judge" falls through to the weighted average.
  async aggregate(results, context) {
    const aggregator = this.config.aggregator;
    switch (aggregator.type) {
      case "code_judge":
        // NOTE(review): no weights argument is passed here, so
        // runCodeAggregator always reports weight 1 per member — confirm
        // whether code aggregators are meant to honor configured weights.
        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
      case "llm_judge":
        return this.runLlmAggregator(results, context, aggregator);
      default:
        return this.runWeightedAverage(results, aggregator.weights);
    }
  }
  // Combine member scores as a weighted mean. Missing weight entries default
  // to 1. Hits/misses are prefixed with the member id; reasonings are joined
  // with "; ". Per-member details are preserved in evaluatorResults.
  runWeightedAverage(results, weights) {
    let totalWeight = 0;
    let weightedSum = 0;
    const allHits = [];
    const allMisses = [];
    const reasoningParts = [];
    const evaluatorResults = [];
    for (const member of results) {
      const weight = weights?.[member.id] ?? 1;
      totalWeight += weight;
      weightedSum += member.result.score * weight;
      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
      if (member.result.reasoning) {
        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
      }
      evaluatorResults.push({
        name: member.id,
        type: member.type,
        score: member.result.score,
        weight,
        verdict: member.result.verdict,
        // Copy arrays so later mutation of member results cannot leak in.
        hits: [...member.result.hits],
        misses: [...member.result.misses],
        reasoning: member.result.reasoning,
        evaluatorRawRequest: member.result.evaluatorRawRequest,
        evaluatorResults: member.result.evaluatorResults
      });
    }
    // Guard against an all-zero weight configuration.
    const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
    return {
      score: clampScore(finalScore),
      verdict: scoreToVerdict(finalScore),
      hits: allHits,
      misses: allMisses,
      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
      evaluatorRawRequest: {
        aggregator: "weighted_average",
        ...weights ? { weights } : {}
      },
      evaluatorResults
    };
  }
  // Pipe all member results (as JSON on stdin) to an external script and
  // parse its stdout as the final {score, verdict, hits, misses, reasoning}.
  // Script failure or unparseable output degrades to a score-0 "fail".
  async runCodeAggregator(results, scriptPath, cwd, weights) {
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
    const evaluatorResults = results.map((member) => ({
      name: member.id,
      type: member.type,
      score: member.result.score,
      weight: weights?.[member.id] ?? 1,
      verdict: member.result.verdict,
      hits: [...member.result.hits],
      misses: [...member.result.misses],
      reasoning: member.result.reasoning,
      evaluatorRawRequest: member.result.evaluatorRawRequest,
      evaluatorResults: member.result.evaluatorResults
    }));
    try {
      // void 0 timeout: the script runs with no agent timeout applied.
      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
      const parsed = parseJsonSafe(stdout);
      // Non-numeric scores collapse to 0 before clamping.
      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
      // Accept the script's verdict only if it is one of the three valid
      // values; otherwise derive it from the score.
      const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
      return {
        score,
        verdict,
        hits,
        misses,
        expectedAspectCount: hits.length + misses.length || 1,
        reasoning,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath
        },
        evaluatorResults
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`Code aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath,
          error: message
        },
        evaluatorResults
      };
    }
  }
  // Ask the judge LLM to produce the final score from the member results.
  // Prefers the provider's language-model interface (generateText); falls
  // back to the generic invoke() API. Any failure (parse or provider error)
  // degrades to a score-0 "fail" — the catch intentionally discards the error.
  async runLlmAggregator(results, context, config) {
    const judgeProvider = context.judgeProvider;
    if (!judgeProvider) {
      throw new Error("No judge provider available for LLM aggregation");
    }
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const resultsJson = JSON.stringify(resultsObject, null, 2);
    const evaluatorResults = results.map((member) => ({
      name: member.id,
      type: member.type,
      score: member.result.score,
      verdict: member.result.verdict,
      hits: [...member.result.hits],
      misses: [...member.result.misses],
      reasoning: member.result.reasoning,
      evaluatorRawRequest: member.result.evaluatorRawRequest,
      evaluatorResults: member.result.evaluatorResults
    }));
    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
    // Substitute every occurrence of the results placeholder.
    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
    const systemPrompt = buildOutputSchema();
    // Recorded request metadata, returned on both success and failure paths.
    const evaluatorRawRequest = {
      aggregator: "llm_judge",
      userPrompt,
      systemPrompt,
      target: judgeProvider.targetName
    };
    try {
      const model = judgeProvider.asLanguageModel?.();
      if (model) {
        const { text } = await (0, import_ai2.generateText)({
          model,
          system: systemPrompt,
          prompt: userPrompt
        });
        const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
        const score2 = clampScore(data2.score);
        // Cap hits/misses at 4 entries each, matching the judge output schema.
        const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
        const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
        const reasoning2 = data2.reasoning;
        return {
          score: score2,
          verdict: scoreToVerdict(score2),
          hits: hits2,
          misses: misses2,
          expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
          reasoning: reasoning2,
          evaluatorRawRequest,
          evaluatorResults
        };
      }
      // Fallback path: provider exposes only the generic invoke() API.
      const response = await judgeProvider.invoke({
        question: userPrompt,
        systemPrompt,
        evalCaseId: context.evalCase.id,
        attempt: context.attempt
      });
      const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
      const score = clampScore(data.score);
      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
      const reasoning = data.reasoning ?? response.reasoning;
      return {
        score,
        verdict: scoreToVerdict(score),
        hits,
        misses,
        expectedAspectCount: Math.max(hits.length + misses.length, 1),
        reasoning,
        evaluatorRawRequest,
        evaluatorResults
      };
    } catch {
      // NOTE(review): the error is swallowed here, so callers cannot tell a
      // provider failure from a genuine zero score — consider surfacing it
      // in reasoning/misses like the code-aggregator path does.
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [],
        expectedAspectCount: 1,
        evaluatorRawRequest,
        evaluatorResults
      };
    }
  }
};
4051
4346
 
4052
4347
  // src/evaluation/orchestrator.ts
4053
4348
  var import_node_crypto2 = require("crypto");
@@ -4743,7 +5038,6 @@ async function runEvaluatorList(options) {
4743
5038
  reasoning: score2.reasoning,
4744
5039
  evaluator_provider_request: score2.evaluatorRawRequest
4745
5040
  });
4746
- continue;
4747
5041
  }
4748
5042
  if (evaluator.type === "code") {
4749
5043
  const codeEvaluator = new CodeEvaluator({
@@ -4760,10 +5054,10 @@ async function runEvaluatorList(options) {
4760
5054
  promptInputs,
4761
5055
  now
4762
5056
  });
4763
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
5057
+ scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
4764
5058
  evaluatorResults.push({
4765
5059
  name: evaluator.name,
4766
- type: evaluator.type,
5060
+ type: "code_judge",
4767
5061
  score: score2.score,
4768
5062
  verdict: score2.verdict,
4769
5063
  hits: score2.hits,
@@ -4771,19 +5065,37 @@ async function runEvaluatorList(options) {
4771
5065
  reasoning: score2.reasoning,
4772
5066
  evaluator_provider_request: score2.evaluatorRawRequest
4773
5067
  });
4774
- continue;
4775
5068
  }
4776
- if (evaluator.type === "rubric") {
4777
- const rubricEvaluator = new RubricEvaluator({
4778
- config: evaluator,
4779
- resolveJudgeProvider: async (context) => {
4780
- if (context.judgeProvider) {
4781
- return context.judgeProvider;
5069
+ if (evaluator.type === "composite") {
5070
+ const evalFileDir = evalCase.guideline_paths[0] ? import_node_path13.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
5071
+ const createEvaluator = (memberConfig) => {
5072
+ switch (memberConfig.type) {
5073
+ case "llm_judge":
5074
+ return evaluatorRegistry.llm_judge;
5075
+ case "code":
5076
+ return new CodeEvaluator({
5077
+ script: memberConfig.script,
5078
+ cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
5079
+ agentTimeoutMs
5080
+ });
5081
+ case "composite":
5082
+ return new CompositeEvaluator({
5083
+ config: memberConfig,
5084
+ cwd: evalFileDir,
5085
+ evaluatorFactory: { create: createEvaluator }
5086
+ });
5087
+ default: {
5088
+ const unknownConfig = memberConfig;
5089
+ throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
4782
5090
  }
4783
- return judgeProvider;
4784
5091
  }
5092
+ };
5093
+ const compositeEvaluator = new CompositeEvaluator({
5094
+ config: evaluator,
5095
+ cwd: evalFileDir,
5096
+ evaluatorFactory: { create: createEvaluator }
4785
5097
  });
4786
- const score2 = await rubricEvaluator.evaluate({
5098
+ const score2 = await compositeEvaluator.evaluate({
4787
5099
  evalCase,
4788
5100
  candidate,
4789
5101
  target,
@@ -4802,27 +5114,31 @@ async function runEvaluatorList(options) {
4802
5114
  hits: score2.hits,
4803
5115
  misses: score2.misses,
4804
5116
  reasoning: score2.reasoning,
4805
- evaluator_provider_request: score2.evaluatorRawRequest
5117
+ evaluator_provider_request: score2.evaluatorRawRequest,
5118
+ evaluator_results: mapChildResults(score2.evaluatorResults)
4806
5119
  });
4807
5120
  }
4808
5121
  } catch (error) {
4809
5122
  const message = error instanceof Error ? error.message : String(error);
4810
5123
  const fallbackScore = {
4811
5124
  score: 0,
5125
+ verdict: "fail",
4812
5126
  hits: [],
4813
5127
  misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
4814
5128
  expectedAspectCount: 1,
4815
5129
  reasoning: message
4816
5130
  };
5131
+ const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
4817
5132
  scored.push({
4818
5133
  score: fallbackScore,
4819
5134
  name: evaluator.name ?? "unknown",
4820
- type: evaluator.type ?? "unknown"
5135
+ type: resultType ?? "llm_judge"
4821
5136
  });
4822
5137
  evaluatorResults.push({
4823
5138
  name: evaluator.name ?? "unknown",
4824
- type: evaluator.type ?? "unknown",
5139
+ type: resultType ?? "llm_judge",
4825
5140
  score: 0,
5141
+ verdict: "fail",
4826
5142
  hits: [],
4827
5143
  misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
4828
5144
  reasoning: message
@@ -4841,6 +5157,7 @@ async function runEvaluatorList(options) {
4841
5157
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
4842
5158
  const score = {
4843
5159
  score: aggregateScore,
5160
+ verdict: scoreToVerdict2(aggregateScore),
4844
5161
  hits,
4845
5162
  misses,
4846
5163
  expectedAspectCount,
@@ -4891,6 +5208,15 @@ async function resolveCustomPrompt(config) {
4891
5208
  function isNonEmptyString2(value) {
4892
5209
  return typeof value === "string" && value.trim().length > 0;
4893
5210
  }
5211
// Map a numeric score onto a verdict label.
// Thresholds: >= 0.8 -> "pass", >= 0.6 -> "borderline", otherwise "fail".
function scoreToVerdict2(score) {
  return score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail";
}
4894
5220
  function filterEvalCases(evalCases, evalId) {
4895
5221
  if (!evalId) {
4896
5222
  return evalCases;
@@ -5028,6 +5354,23 @@ function isTimeoutLike(error) {
5028
5354
  const value = String(error).toLowerCase();
5029
5355
  return value.includes("timeout");
5030
5356
  }
5357
// Recursively convert nested evaluator results from the internal camelCase
// shape to the snake_case report shape. Empty or missing child lists are
// reported as undefined rather than [].
function mapChildResults(children) {
  if (!children || children.length === 0) {
    return undefined;
  }
  return children.map(
    ({
      name,
      type,
      score,
      weight,
      verdict,
      hits,
      misses,
      reasoning,
      evaluatorRawRequest,
      evaluatorResults
    }) => ({
      name,
      type,
      score,
      weight,
      verdict,
      hits,
      misses,
      reasoning,
      evaluator_provider_request: evaluatorRawRequest,
      evaluator_results: mapChildResults(evaluatorResults)
    })
  );
}
5031
5374
 
5032
5375
  // src/evaluation/generators/rubric-generator.ts
5033
5376
  var import_ai3 = require("ai");
@@ -5116,8 +5459,8 @@ function createAgentKernel() {
5116
5459
  // Annotate the CommonJS export names for ESM import in node:
5117
5460
  0 && (module.exports = {
5118
5461
  CodeEvaluator,
5462
+ CompositeEvaluator,
5119
5463
  LlmJudgeEvaluator,
5120
- RubricEvaluator,
5121
5464
  TEST_MESSAGE_ROLES,
5122
5465
  buildDirectoryChain,
5123
5466
  buildPromptInputs,