@agentv/core 0.22.0 → 0.22.2

This diff shows the changes between publicly released versions of the package as published to a supported registry, and is provided for informational purposes only.
package/dist/index.cjs CHANGED
@@ -32,7 +32,6 @@ var index_exports = {};
  __export(index_exports, {
  CodeEvaluator: () => CodeEvaluator,
  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
- RubricEvaluator: () => RubricEvaluator,
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
  buildDirectoryChain: () => buildDirectoryChain2,
  buildPromptInputs: () => buildPromptInputs,
@@ -510,25 +509,25 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  }
  }
  const _model = asString2(rawEvaluator.model);
+ const rawRubrics = rawEvaluator.rubrics;
+ const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
+ id: asString2(rubric.id) ?? `rubric-${index + 1}`,
+ description: asString2(rubric.description) ?? "",
+ weight: typeof rubric.weight === "number" ? rubric.weight : 1,
+ required: typeof rubric.required === "boolean" ? rubric.required : true
+ })).filter((r) => r.description.length > 0) : void 0;
  if (typeValue === "rubric") {
- const rubrics = rawEvaluator.rubrics;
- if (!Array.isArray(rubrics)) {
+ if (!parsedRubrics) {
  logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
  continue;
  }
- const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
- id: asString2(rubric.id) ?? `rubric-${index + 1}`,
- description: asString2(rubric.description) ?? "",
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
- required: typeof rubric.required === "boolean" ? rubric.required : true
- })).filter((r) => r.description.length > 0);
  if (parsedRubrics.length === 0) {
  logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
  continue;
  }
  evaluators.push({
  name,
- type: "rubric",
+ type: "llm_judge",
  rubrics: parsedRubrics
  });
  continue;
@@ -537,7 +536,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  name,
  type: "llm_judge",
  prompt,
- promptPath
+ promptPath,
+ ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
  });
  }
  return evaluators.length > 0 ? evaluators : void 0;
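The hoisted parsing above implies each parsed rubric item has the following shape (a sketch inferred from the defaults in the bundled code, not a documented public type):

    // Inferred shape of a parsed rubric item (descriptive sketch only)
    interface RubricItem {
      id: string;          // falls back to `rubric-${index + 1}` when omitted
      description: string; // items with an empty description are filtered out
      weight: number;      // defaults to 1
      required: boolean;   // defaults to true
    }
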
@@ -1088,7 +1088,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
  if (rubricItems.length > 0) {
  const rubricEvaluator = {
  name: "rubric",
- type: "rubric",
+ type: "llm_judge",
  rubrics: rubricItems
  };
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
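With this change, rubric items declared in an eval file now load as an llm_judge evaluator rather than a separate rubric type. A hypothetical illustration of the resulting object (example values invented):

    const rubricEvaluator = {
      name: "rubric",
      type: "llm_judge", // was "rubric" in 0.22.0
      rubrics: [
        { id: "rubric-1", description: "Names the root cause", weight: 1, required: true }
      ]
    };
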
@@ -3621,149 +3621,9 @@ function resolveAndCreateProvider(definition, env = process.env) {
  return createProvider(resolved);
  }

- // src/evaluation/evaluators/rubric-evaluator.ts
+ // src/evaluation/evaluators.ts
  var import_ai2 = require("ai");
  var import_zod2 = require("zod");
- var rubricCheckResultSchema = import_zod2.z.object({
- id: import_zod2.z.string().describe("The ID of the rubric item being checked"),
- satisfied: import_zod2.z.boolean().describe("Whether this rubric requirement is met"),
- reasoning: import_zod2.z.string().describe("Brief explanation (1-2 sentences) for this check")
- });
- var rubricEvaluationSchema = import_zod2.z.object({
- checks: import_zod2.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
- overall_reasoning: import_zod2.z.string().describe("Overall assessment summary (1-2 sentences)")
- });
- var RubricEvaluator = class {
- kind = "rubric";
- config;
- resolveJudgeProvider;
- constructor(options) {
- this.config = options.config;
- this.resolveJudgeProvider = options.resolveJudgeProvider;
- }
- async evaluate(context) {
- const judgeProvider = await this.resolveJudgeProvider(context);
- if (!judgeProvider) {
- throw new Error("No judge provider available for rubric evaluation");
- }
- if (!this.config.rubrics || this.config.rubrics.length === 0) {
- throw new Error(
- `No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
- );
- }
- const prompt = this.buildPrompt(context, this.config.rubrics);
- const model = judgeProvider.asLanguageModel?.();
- if (!model) {
- throw new Error("Judge provider does not support language model interface");
- }
- const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
- You must return a valid JSON object matching this schema:
- {
- "checks": [
- {
- "id": "string (rubric id)",
- "satisfied": boolean,
- "reasoning": "string (brief explanation)"
- }
- ],
- "overall_reasoning": "string (summary)"
- }`;
- let result;
- let lastError;
- for (let attempt = 1; attempt <= 3; attempt++) {
- try {
- const { text } = await (0, import_ai2.generateText)({
- model,
- system,
- prompt
- });
- const cleaned = text.replace(/```json\n?|```/g, "").trim();
- result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
- break;
- } catch (e) {
- lastError = e instanceof Error ? e : new Error(String(e));
- }
- }
- if (!result) {
- throw new Error(
- `Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
- );
- }
- const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
- return {
- score,
- verdict,
- hits,
- misses,
- expectedAspectCount: this.config.rubrics.length,
- reasoning: result.overall_reasoning,
- evaluatorRawRequest: {
- prompt
- }
- };
- }
- buildPrompt(context, rubrics) {
- const parts = [
- "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
- "",
- "[[ ## question ## ]]",
- context.evalCase.question,
- "",
- "[[ ## expected_outcome ## ]]",
- context.evalCase.expected_outcome,
- ""
- ];
- if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
- parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
- }
- parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
- for (const rubric of rubrics) {
- const requiredLabel = rubric.required ? " (REQUIRED)" : "";
- const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
- }
- parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
- return parts.join("\n");
- }
- calculateScore(result, rubrics) {
- const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
- const hits = [];
- const misses = [];
- let totalWeight = 0;
- let earnedWeight = 0;
- let failedRequired = false;
- for (const check of result.checks) {
- const rubric = rubricMap.get(check.id);
- if (!rubric) {
- continue;
- }
- totalWeight += rubric.weight;
- if (check.satisfied) {
- earnedWeight += rubric.weight;
- hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
- } else {
- misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
- if (rubric.required) {
- failedRequired = true;
- }
- }
- }
- const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
- let verdict;
- if (failedRequired) {
- verdict = "fail";
- } else if (score >= 0.8) {
- verdict = "pass";
- } else if (score >= 0.6) {
- verdict = "borderline";
- } else {
- verdict = "fail";
- }
- return { score, verdict, hits, misses };
- }
- };
-
- // src/evaluation/evaluators.ts
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.

  Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
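The RubricEvaluator class and its export are removed in 0.22.2; rubric grading now lives inside LlmJudgeEvaluator (see the hunks below). A hedged migration sketch for downstream code that used the removed class:

    // Before (0.22.0):
    //   const evaluator = new RubricEvaluator({ config, resolveJudgeProvider });
    // After (0.22.2): there is no separate class; LlmJudgeEvaluator grades against
    // rubrics whenever the evaluator config it receives carries a non-empty
    // `rubrics` array (its exact constructor options are not shown in this diff).
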
@@ -3781,6 +3641,21 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r

  [[ ## candidate_answer ## ]]
  {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
+ var freeformEvaluationSchema = import_zod2.z.object({
+ score: import_zod2.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
+ hits: import_zod2.z.array(import_zod2.z.string()).describe("Brief specific achievements").optional(),
+ misses: import_zod2.z.array(import_zod2.z.string()).describe("Brief failures or omissions").optional(),
+ reasoning: import_zod2.z.string().describe("Concise explanation (1-2 sentences)").optional()
+ });
+ var rubricCheckResultSchema = import_zod2.z.object({
+ id: import_zod2.z.string().describe("The ID of the rubric item being checked"),
+ satisfied: import_zod2.z.boolean().describe("Whether this rubric requirement is met"),
+ reasoning: import_zod2.z.string().describe("Brief explanation (1-2 sentences) for this check")
+ });
+ var rubricEvaluationSchema = import_zod2.z.object({
+ checks: import_zod2.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
+ overall_reasoning: import_zod2.z.string().describe("Overall assessment summary (1-2 sentences)")
+ });
  var LlmJudgeEvaluator = class {
  kind = "llm_judge";
  resolveJudgeProvider;
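For reference, a judge response that satisfies rubricEvaluationSchema above would look like this (illustrative values only):

    const exampleJudgeResponse = {
      checks: [
        { id: "rubric-1", satisfied: true, reasoning: "The answer names the root cause." },
        { id: "rubric-2", satisfied: false, reasoning: "No mitigation steps are given." }
      ],
      overall_reasoning: "Covers the diagnosis but omits remediation."
    };
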
@@ -3798,9 +3673,13 @@ var LlmJudgeEvaluator = class {
  if (!judgeProvider) {
  throw new Error("No judge provider available for LLM grading");
  }
- return this.evaluateWithPrompt(context, judgeProvider);
+ const config = context.evaluator;
+ if (config?.type === "llm_judge" && config.rubrics && config.rubrics.length > 0) {
+ return this.evaluateWithRubrics(context, judgeProvider, config.rubrics);
+ }
+ return this.evaluateFreeform(context, judgeProvider);
  }
- async evaluateWithPrompt(context, judgeProvider) {
+ async evaluateFreeform(context, judgeProvider) {
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
  const variables = {
  [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
@@ -3817,34 +3696,132 @@ var LlmJudgeEvaluator = class {
  const systemPrompt = buildOutputSchema();
  const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
  const userPrompt = substituteVariables(evaluatorTemplate, variables);
- const response = await judgeProvider.invoke({
- question: userPrompt,
- systemPrompt,
- evalCaseId: context.evalCase.id,
- attempt: context.attempt,
- maxOutputTokens: this.maxOutputTokens,
- temperature: this.temperature
- });
- const parsed = parseQualityResponse(response);
- const score = clampScore(parsed.score ?? 0);
- const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
- const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
- const reasoning = parsed.reasoning ?? response.reasoning;
- const expectedAspectCount = Math.max(hits.length + misses.length, 1);
  const evaluatorRawRequest = {
  userPrompt,
  systemPrompt,
  target: judgeProvider.targetName
  };
+ try {
+ const { data, providerResponse } = await this.runWithRetry({
+ context,
+ judgeProvider,
+ systemPrompt,
+ userPrompt,
+ schema: freeformEvaluationSchema
+ });
+ const score = clampScore(data.score);
+ const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
+ const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
+ const reasoning = data.reasoning ?? providerResponse?.reasoning;
+ const expectedAspectCount = Math.max(hits.length + misses.length, 1);
+ return {
+ score,
+ verdict: scoreToVerdict(score),
+ hits,
+ misses,
+ expectedAspectCount,
+ reasoning,
+ evaluatorRawRequest
+ };
+ } catch {
+ return {
+ score: 0,
+ verdict: "fail",
+ hits: [],
+ misses: [],
+ expectedAspectCount: 1,
+ evaluatorRawRequest
+ };
+ }
+ }
+ async evaluateWithRubrics(context, judgeProvider, rubrics) {
+ if (!rubrics || rubrics.length === 0) {
+ throw new Error(
+ `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
+ );
+ }
+ const prompt = this.buildRubricPrompt(context, rubrics);
+ const systemPrompt = buildRubricOutputSchema();
+ const evaluatorRawRequest = {
+ userPrompt: prompt,
+ systemPrompt,
+ target: judgeProvider.targetName
+ };
+ const { data } = await this.runWithRetry({
+ context,
+ judgeProvider,
+ systemPrompt,
+ userPrompt: prompt,
+ schema: rubricEvaluationSchema
+ });
+ const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
  return {
  score,
+ verdict,
  hits,
  misses,
- expectedAspectCount,
- reasoning,
+ expectedAspectCount: rubrics.length,
+ reasoning: data.overall_reasoning,
  evaluatorRawRequest
  };
  }
+ buildRubricPrompt(context, rubrics) {
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
+ const parts = [
+ "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
+ "",
+ "[[ ## question ## ]]",
+ formattedQuestion,
+ "",
+ "[[ ## expected_outcome ## ]]",
+ context.evalCase.expected_outcome,
+ ""
+ ];
+ if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
+ parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
+ }
+ parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
+ for (const rubric of rubrics) {
+ const requiredLabel = rubric.required ? " (REQUIRED)" : "";
+ const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
+ parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
+ }
+ parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
+ return parts.join("\n");
+ }
+ async runWithRetry(options) {
+ const { context, judgeProvider, systemPrompt, userPrompt, schema } = options;
+ let lastError;
+ for (let attempt = 1; attempt <= 3; attempt++) {
+ try {
+ const model = judgeProvider.asLanguageModel?.();
+ if (model) {
+ const { text } = await (0, import_ai2.generateText)({
+ model,
+ system: systemPrompt,
+ prompt: userPrompt,
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
+ });
+ const data2 = schema.parse(parseJsonFromText(text));
+ return { data: data2 };
+ }
+ const response = await judgeProvider.invoke({
+ question: userPrompt,
+ systemPrompt,
+ evalCaseId: context.evalCase.id,
+ attempt: context.attempt,
+ maxOutputTokens: this.maxOutputTokens,
+ temperature: this.temperature
+ });
+ const data = schema.parse(parseJsonFromText(response.text ?? ""));
+ return { data, providerResponse: response };
+ } catch (e) {
+ lastError = e instanceof Error ? e : new Error(String(e));
+ }
+ }
+ throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
+ }
  };
  function buildOutputSchema() {
  return [
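The new runWithRetry helper prefers the provider's language-model interface and falls back to invoke(), retrying up to three times before giving up. A minimal standalone sketch of the same retry-and-parse pattern, assuming a zod-style schema (this is not the package's exported API):

    async function parseWithRetry<T>(
      generate: () => Promise<string>,
      schema: { parse(value: unknown): T }
    ): Promise<T> {
      let lastError: Error | undefined;
      for (let attempt = 1; attempt <= 3; attempt++) {
        try {
          const text = await generate();
          // Strip markdown fences before validating against the schema
          const cleaned = text.replace(/```json\n?|```/g, "").trim();
          return schema.parse(JSON.parse(cleaned));
        } catch (e) {
          lastError = e instanceof Error ? e : new Error(String(e));
        }
      }
      throw new Error(`Failed to parse after 3 attempts: ${lastError?.message}`);
    }
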
@@ -3858,6 +3835,29 @@ function buildOutputSchema() {
  "}"
  ].join("\n");
  }
+ function buildRubricOutputSchema() {
+ return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
+ You must return a valid JSON object matching this schema:
+ {
+ "checks": [
+ {
+ "id": "string (rubric id)",
+ "satisfied": boolean,
+ "reasoning": "string (brief explanation)"
+ }
+ ],
+ "overall_reasoning": "string (summary)"
+ }`;
+ }
+ function scoreToVerdict(score) {
+ if (score >= 0.8) {
+ return "pass";
+ }
+ if (score >= 0.6) {
+ return "borderline";
+ }
+ return "fail";
+ }
  function clampScore(value) {
  if (Number.isNaN(value) || !Number.isFinite(value)) {
  return 0;
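scoreToVerdict is a plain threshold mapping: scores of 0.8 and above pass, 0.6 up to (but not including) 0.8 are borderline, and everything lower fails. For example:

    scoreToVerdict(0.85); // "pass"
    scoreToVerdict(0.75); // "borderline"
    scoreToVerdict(0.5);  // "fail"
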
@@ -3870,71 +3870,15 @@ function clampScore(value) {
  }
  return value;
  }
- function parseQualityResponse(response) {
- const text = typeof response.text === "string" ? response.text.trim() : "";
- if (text.length === 0) {
- return {};
- }
- const direct = attemptParseJson(text);
- if (direct && validateQualityJson(direct)) {
- return direct;
- }
- const extracted = extractJsonBlob(text);
- if (extracted) {
- const parsed = attemptParseJson(extracted);
- if (parsed && validateQualityJson(parsed)) {
- return parsed;
- }
- }
- return {};
- }
- function attemptParseJson(text) {
- try {
- const parsed = JSON.parse(text);
- const score = typeof parsed.score === "number" ? parsed.score : void 0;
- const hits = parsed.hits;
- const misses = parsed.misses;
- const reasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : void 0;
- return { score, hits, misses, reasoning };
- } catch {
- return void 0;
- }
- }
- function validateQualityJson(parsed) {
- if (typeof parsed.score !== "number") {
- return false;
- }
- if (Number.isNaN(parsed.score) || !Number.isFinite(parsed.score)) {
- return false;
- }
- if (parsed.score < 0 || parsed.score > 1) {
- return false;
- }
- if (parsed.hits !== void 0) {
- if (!Array.isArray(parsed.hits)) {
- return false;
- }
- if (!parsed.hits.every((item) => typeof item === "string")) {
- return false;
- }
- }
- if (parsed.misses !== void 0) {
- if (!Array.isArray(parsed.misses)) {
- return false;
- }
- if (!parsed.misses.every((item) => typeof item === "string")) {
- return false;
- }
- }
- if (parsed.reasoning !== void 0 && typeof parsed.reasoning !== "string") {
- return false;
- }
- return true;
- }
  function extractJsonBlob(text) {
  const match = text.match(/\{[\s\S]*\}/);
  return match?.[0];
  }
+ function parseJsonFromText(text) {
+ const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
+ const blob = extractJsonBlob(cleaned) ?? cleaned;
+ return JSON.parse(blob);
+ }
  function isNonEmptyString(value) {
  return typeof value === "string" && value.trim().length > 0;
  }
@@ -3971,6 +3915,7 @@ var CodeEvaluator = class {
  const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
  return {
  score,
+ verdict: scoreToVerdict(score),
  hits,
  misses,
  expectedAspectCount: hits.length + misses.length || 1,
@@ -3984,6 +3929,7 @@ var CodeEvaluator = class {
  const message = error instanceof Error ? error.message : String(error);
  return {
  score: 0,
+ verdict: "fail",
  hits: [],
  misses: [`Code evaluator failed: ${message}`],
  expectedAspectCount: 1,
@@ -3997,6 +3943,33 @@ var CodeEvaluator = class {
  }
  }
  };
+ function calculateRubricScore(result, rubrics) {
+ const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
+ const hits = [];
+ const misses = [];
+ let totalWeight = 0;
+ let earnedWeight = 0;
+ let failedRequired = false;
+ for (const check of result.checks) {
+ const rubric = rubricMap.get(check.id);
+ if (!rubric) {
+ continue;
+ }
+ totalWeight += rubric.weight;
+ if (check.satisfied) {
+ earnedWeight += rubric.weight;
+ hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+ } else {
+ misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+ if (rubric.required) {
+ failedRequired = true;
+ }
+ }
+ }
+ const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
+ const verdict = failedRequired ? "fail" : scoreToVerdict(score);
+ return { score, verdict, hits, misses };
+ }
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
  const { spawn: spawn2 } = await import("child_process");
  return await new Promise((resolve, reject) => {
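calculateRubricScore computes a weighted fraction of satisfied rubrics and fails outright when a required rubric is missed. A worked example with invented rubric data: weights 2 and 1 with only the first satisfied gives score 2 / 3 ≈ 0.67, i.e. "borderline"; if the missed rubric were required, the verdict would be "fail" regardless of the score.

    const { score, verdict } = calculateRubricScore(
      {
        checks: [
          { id: "a", satisfied: true, reasoning: "covered" },
          { id: "b", satisfied: false, reasoning: "missing" }
        ],
        overall_reasoning: "partial coverage"
      },
      [
        { id: "a", description: "Key point A", weight: 2, required: false },
        { id: "b", description: "Key point B", weight: 1, required: false }
      ]
    );
    // score ≈ 0.667, verdict === "borderline"
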
@@ -4743,7 +4716,6 @@ async function runEvaluatorList(options) {
  reasoning: score2.reasoning,
  evaluator_provider_request: score2.evaluatorRawRequest
  });
- continue;
  }
  if (evaluator.type === "code") {
  const codeEvaluator = new CodeEvaluator({
@@ -4771,44 +4743,12 @@ async function runEvaluatorList(options) {
  reasoning: score2.reasoning,
  evaluator_provider_request: score2.evaluatorRawRequest
  });
- continue;
- }
- if (evaluator.type === "rubric") {
- const rubricEvaluator = new RubricEvaluator({
- config: evaluator,
- resolveJudgeProvider: async (context) => {
- if (context.judgeProvider) {
- return context.judgeProvider;
- }
- return judgeProvider;
- }
- });
- const score2 = await rubricEvaluator.evaluate({
- evalCase,
- candidate,
- target,
- provider,
- attempt,
- promptInputs,
- now,
- judgeProvider
- });
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
- evaluatorResults.push({
- name: evaluator.name,
- type: evaluator.type,
- score: score2.score,
- verdict: score2.verdict,
- hits: score2.hits,
- misses: score2.misses,
- reasoning: score2.reasoning,
- evaluator_provider_request: score2.evaluatorRawRequest
- });
  }
  } catch (error) {
  const message = error instanceof Error ? error.message : String(error);
  const fallbackScore = {
  score: 0,
+ verdict: "fail",
  hits: [],
  misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
  expectedAspectCount: 1,
@@ -4823,6 +4763,7 @@ async function runEvaluatorList(options) {
  name: evaluator.name ?? "unknown",
  type: evaluator.type ?? "unknown",
  score: 0,
+ verdict: "fail",
  hits: [],
  misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
  reasoning: message
@@ -4841,6 +4782,7 @@ async function runEvaluatorList(options) {
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
  const score = {
  score: aggregateScore,
+ verdict: scoreToVerdict2(aggregateScore),
  hits,
  misses,
  expectedAspectCount,
@@ -4891,6 +4833,15 @@ async function resolveCustomPrompt(config) {
  function isNonEmptyString2(value) {
  return typeof value === "string" && value.trim().length > 0;
  }
+ function scoreToVerdict2(score) {
+ if (score >= 0.8) {
+ return "pass";
+ }
+ if (score >= 0.6) {
+ return "borderline";
+ }
+ return "fail";
+ }
  function filterEvalCases(evalCases, evalId) {
  if (!evalId) {
  return evalCases;
@@ -5117,7 +5068,6 @@ function createAgentKernel() {
  0 && (module.exports = {
  CodeEvaluator,
  LlmJudgeEvaluator,
- RubricEvaluator,
  TEST_MESSAGE_ROLES,
  buildDirectoryChain,
  buildPromptInputs,