@agentv/core 0.22.1 → 0.22.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -453,25 +453,25 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
453
453
  }
454
454
  }
455
455
  const _model = asString2(rawEvaluator.model);
456
+ const rawRubrics = rawEvaluator.rubrics;
457
+ const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
458
+ id: asString2(rubric.id) ?? `rubric-${index + 1}`,
459
+ description: asString2(rubric.description) ?? "",
460
+ weight: typeof rubric.weight === "number" ? rubric.weight : 1,
461
+ required: typeof rubric.required === "boolean" ? rubric.required : true
462
+ })).filter((r) => r.description.length > 0) : void 0;
456
463
  if (typeValue === "rubric") {
457
- const rubrics = rawEvaluator.rubrics;
458
- if (!Array.isArray(rubrics)) {
464
+ if (!parsedRubrics) {
459
465
  logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
460
466
  continue;
461
467
  }
462
- const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
463
- id: asString2(rubric.id) ?? `rubric-${index + 1}`,
464
- description: asString2(rubric.description) ?? "",
465
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
466
- required: typeof rubric.required === "boolean" ? rubric.required : true
467
- })).filter((r) => r.description.length > 0);
468
468
  if (parsedRubrics.length === 0) {
469
469
  logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
470
470
  continue;
471
471
  }
472
472
  evaluators.push({
473
473
  name,
474
- type: "rubric",
474
+ type: "llm_judge",
475
475
  rubrics: parsedRubrics
476
476
  });
477
477
  continue;
@@ -480,7 +480,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
480
480
  name,
481
481
  type: "llm_judge",
482
482
  prompt,
483
- promptPath
483
+ promptPath,
484
+ ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
484
485
  });
485
486
  }
486
487
  return evaluators.length > 0 ? evaluators : void 0;
@@ -1031,7 +1032,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1031
1032
  if (rubricItems.length > 0) {
1032
1033
  const rubricEvaluator = {
1033
1034
  name: "rubric",
1034
- type: "rubric",
1035
+ type: "llm_judge",
1035
1036
  rubrics: rubricItems
1036
1037
  };
1037
1038
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
@@ -2928,149 +2929,9 @@ function resolveAndCreateProvider(definition, env = process.env) {
2928
2929
  return createProvider(resolved);
2929
2930
  }
2930
2931
 
2931
- // src/evaluation/evaluators/rubric-evaluator.ts
2932
+ // src/evaluation/evaluators.ts
2932
2933
  import { generateText as generateText2 } from "ai";
2933
2934
  import { z } from "zod";
2934
- var rubricCheckResultSchema = z.object({
2935
- id: z.string().describe("The ID of the rubric item being checked"),
2936
- satisfied: z.boolean().describe("Whether this rubric requirement is met"),
2937
- reasoning: z.string().describe("Brief explanation (1-2 sentences) for this check")
2938
- });
2939
- var rubricEvaluationSchema = z.object({
2940
- checks: z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
2941
- overall_reasoning: z.string().describe("Overall assessment summary (1-2 sentences)")
2942
- });
2943
- var RubricEvaluator = class {
2944
- kind = "rubric";
2945
- config;
2946
- resolveJudgeProvider;
2947
- constructor(options) {
2948
- this.config = options.config;
2949
- this.resolveJudgeProvider = options.resolveJudgeProvider;
2950
- }
2951
- async evaluate(context) {
2952
- const judgeProvider = await this.resolveJudgeProvider(context);
2953
- if (!judgeProvider) {
2954
- throw new Error("No judge provider available for rubric evaluation");
2955
- }
2956
- if (!this.config.rubrics || this.config.rubrics.length === 0) {
2957
- throw new Error(
2958
- `No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
2959
- );
2960
- }
2961
- const prompt = this.buildPrompt(context, this.config.rubrics);
2962
- const model = judgeProvider.asLanguageModel?.();
2963
- if (!model) {
2964
- throw new Error("Judge provider does not support language model interface");
2965
- }
2966
- const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
2967
- You must return a valid JSON object matching this schema:
2968
- {
2969
- "checks": [
2970
- {
2971
- "id": "string (rubric id)",
2972
- "satisfied": boolean,
2973
- "reasoning": "string (brief explanation)"
2974
- }
2975
- ],
2976
- "overall_reasoning": "string (summary)"
2977
- }`;
2978
- let result;
2979
- let lastError;
2980
- for (let attempt = 1; attempt <= 3; attempt++) {
2981
- try {
2982
- const { text } = await generateText2({
2983
- model,
2984
- system,
2985
- prompt
2986
- });
2987
- const cleaned = text.replace(/```json\n?|```/g, "").trim();
2988
- result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
2989
- break;
2990
- } catch (e) {
2991
- lastError = e instanceof Error ? e : new Error(String(e));
2992
- }
2993
- }
2994
- if (!result) {
2995
- throw new Error(
2996
- `Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
2997
- );
2998
- }
2999
- const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
3000
- return {
3001
- score,
3002
- verdict,
3003
- hits,
3004
- misses,
3005
- expectedAspectCount: this.config.rubrics.length,
3006
- reasoning: result.overall_reasoning,
3007
- evaluatorRawRequest: {
3008
- prompt
3009
- }
3010
- };
3011
- }
3012
- buildPrompt(context, rubrics) {
3013
- const parts = [
3014
- "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
3015
- "",
3016
- "[[ ## question ## ]]",
3017
- context.evalCase.question,
3018
- "",
3019
- "[[ ## expected_outcome ## ]]",
3020
- context.evalCase.expected_outcome,
3021
- ""
3022
- ];
3023
- if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
3024
- parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
3025
- }
3026
- parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
3027
- for (const rubric of rubrics) {
3028
- const requiredLabel = rubric.required ? " (REQUIRED)" : "";
3029
- const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
3030
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
3031
- }
3032
- parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
3033
- return parts.join("\n");
3034
- }
3035
- calculateScore(result, rubrics) {
3036
- const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
3037
- const hits = [];
3038
- const misses = [];
3039
- let totalWeight = 0;
3040
- let earnedWeight = 0;
3041
- let failedRequired = false;
3042
- for (const check of result.checks) {
3043
- const rubric = rubricMap.get(check.id);
3044
- if (!rubric) {
3045
- continue;
3046
- }
3047
- totalWeight += rubric.weight;
3048
- if (check.satisfied) {
3049
- earnedWeight += rubric.weight;
3050
- hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
3051
- } else {
3052
- misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
3053
- if (rubric.required) {
3054
- failedRequired = true;
3055
- }
3056
- }
3057
- }
3058
- const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
3059
- let verdict;
3060
- if (failedRequired) {
3061
- verdict = "fail";
3062
- } else if (score >= 0.8) {
3063
- verdict = "pass";
3064
- } else if (score >= 0.6) {
3065
- verdict = "borderline";
3066
- } else {
3067
- verdict = "fail";
3068
- }
3069
- return { score, verdict, hits, misses };
3070
- }
3071
- };
3072
-
3073
- // src/evaluation/evaluators.ts
3074
2935
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
3075
2936
 
3076
2937
  Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -3088,6 +2949,21 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
3088
2949
 
3089
2950
  [[ ## candidate_answer ## ]]
3090
2951
  {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
2952
+ var freeformEvaluationSchema = z.object({
2953
+ score: z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
2954
+ hits: z.array(z.string()).describe("Brief specific achievements").optional(),
2955
+ misses: z.array(z.string()).describe("Brief failures or omissions").optional(),
2956
+ reasoning: z.string().describe("Concise explanation (1-2 sentences)").optional()
2957
+ });
2958
+ var rubricCheckResultSchema = z.object({
2959
+ id: z.string().describe("The ID of the rubric item being checked"),
2960
+ satisfied: z.boolean().describe("Whether this rubric requirement is met"),
2961
+ reasoning: z.string().describe("Brief explanation (1-2 sentences) for this check")
2962
+ });
2963
+ var rubricEvaluationSchema = z.object({
2964
+ checks: z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
2965
+ overall_reasoning: z.string().describe("Overall assessment summary (1-2 sentences)")
2966
+ });
3091
2967
  var LlmJudgeEvaluator = class {
3092
2968
  kind = "llm_judge";
3093
2969
  resolveJudgeProvider;
@@ -3105,9 +2981,13 @@ var LlmJudgeEvaluator = class {
3105
2981
  if (!judgeProvider) {
3106
2982
  throw new Error("No judge provider available for LLM grading");
3107
2983
  }
3108
- return this.evaluateWithPrompt(context, judgeProvider);
2984
+ const config = context.evaluator;
2985
+ if (config?.type === "llm_judge" && config.rubrics && config.rubrics.length > 0) {
2986
+ return this.evaluateWithRubrics(context, judgeProvider, config.rubrics);
2987
+ }
2988
+ return this.evaluateFreeform(context, judgeProvider);
3109
2989
  }
3110
- async evaluateWithPrompt(context, judgeProvider) {
2990
+ async evaluateFreeform(context, judgeProvider) {
3111
2991
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
3112
2992
  const variables = {
3113
2993
  [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
@@ -3124,34 +3004,132 @@ var LlmJudgeEvaluator = class {
3124
3004
  const systemPrompt = buildOutputSchema();
3125
3005
  const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
3126
3006
  const userPrompt = substituteVariables(evaluatorTemplate, variables);
3127
- const response = await judgeProvider.invoke({
3128
- question: userPrompt,
3129
- systemPrompt,
3130
- evalCaseId: context.evalCase.id,
3131
- attempt: context.attempt,
3132
- maxOutputTokens: this.maxOutputTokens,
3133
- temperature: this.temperature
3134
- });
3135
- const parsed = parseQualityResponse(response);
3136
- const score = clampScore(parsed.score ?? 0);
3137
- const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
3138
- const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
3139
- const reasoning = parsed.reasoning ?? response.reasoning;
3140
- const expectedAspectCount = Math.max(hits.length + misses.length, 1);
3141
3007
  const evaluatorRawRequest = {
3142
3008
  userPrompt,
3143
3009
  systemPrompt,
3144
3010
  target: judgeProvider.targetName
3145
3011
  };
3012
+ try {
3013
+ const { data, providerResponse } = await this.runWithRetry({
3014
+ context,
3015
+ judgeProvider,
3016
+ systemPrompt,
3017
+ userPrompt,
3018
+ schema: freeformEvaluationSchema
3019
+ });
3020
+ const score = clampScore(data.score);
3021
+ const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
3022
+ const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
3023
+ const reasoning = data.reasoning ?? providerResponse?.reasoning;
3024
+ const expectedAspectCount = Math.max(hits.length + misses.length, 1);
3025
+ return {
3026
+ score,
3027
+ verdict: scoreToVerdict(score),
3028
+ hits,
3029
+ misses,
3030
+ expectedAspectCount,
3031
+ reasoning,
3032
+ evaluatorRawRequest
3033
+ };
3034
+ } catch {
3035
+ return {
3036
+ score: 0,
3037
+ verdict: "fail",
3038
+ hits: [],
3039
+ misses: [],
3040
+ expectedAspectCount: 1,
3041
+ evaluatorRawRequest
3042
+ };
3043
+ }
3044
+ }
3045
+ async evaluateWithRubrics(context, judgeProvider, rubrics) {
3046
+ if (!rubrics || rubrics.length === 0) {
3047
+ throw new Error(
3048
+ `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
3049
+ );
3050
+ }
3051
+ const prompt = this.buildRubricPrompt(context, rubrics);
3052
+ const systemPrompt = buildRubricOutputSchema();
3053
+ const evaluatorRawRequest = {
3054
+ userPrompt: prompt,
3055
+ systemPrompt,
3056
+ target: judgeProvider.targetName
3057
+ };
3058
+ const { data } = await this.runWithRetry({
3059
+ context,
3060
+ judgeProvider,
3061
+ systemPrompt,
3062
+ userPrompt: prompt,
3063
+ schema: rubricEvaluationSchema
3064
+ });
3065
+ const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
3146
3066
  return {
3147
3067
  score,
3068
+ verdict,
3148
3069
  hits,
3149
3070
  misses,
3150
- expectedAspectCount,
3151
- reasoning,
3071
+ expectedAspectCount: rubrics.length,
3072
+ reasoning: data.overall_reasoning,
3152
3073
  evaluatorRawRequest
3153
3074
  };
3154
3075
  }
3076
+ buildRubricPrompt(context, rubrics) {
3077
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
3078
+ const parts = [
3079
+ "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
3080
+ "",
3081
+ "[[ ## question ## ]]",
3082
+ formattedQuestion,
3083
+ "",
3084
+ "[[ ## expected_outcome ## ]]",
3085
+ context.evalCase.expected_outcome,
3086
+ ""
3087
+ ];
3088
+ if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
3089
+ parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
3090
+ }
3091
+ parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
3092
+ for (const rubric of rubrics) {
3093
+ const requiredLabel = rubric.required ? " (REQUIRED)" : "";
3094
+ const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
3095
+ parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
3096
+ }
3097
+ parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
3098
+ return parts.join("\n");
3099
+ }
3100
+ async runWithRetry(options) {
3101
+ const { context, judgeProvider, systemPrompt, userPrompt, schema } = options;
3102
+ let lastError;
3103
+ for (let attempt = 1; attempt <= 3; attempt++) {
3104
+ try {
3105
+ const model = judgeProvider.asLanguageModel?.();
3106
+ if (model) {
3107
+ const { text } = await generateText2({
3108
+ model,
3109
+ system: systemPrompt,
3110
+ prompt: userPrompt,
3111
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
3112
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
3113
+ });
3114
+ const data2 = schema.parse(parseJsonFromText(text));
3115
+ return { data: data2 };
3116
+ }
3117
+ const response = await judgeProvider.invoke({
3118
+ question: userPrompt,
3119
+ systemPrompt,
3120
+ evalCaseId: context.evalCase.id,
3121
+ attempt: context.attempt,
3122
+ maxOutputTokens: this.maxOutputTokens,
3123
+ temperature: this.temperature
3124
+ });
3125
+ const data = schema.parse(parseJsonFromText(response.text ?? ""));
3126
+ return { data, providerResponse: response };
3127
+ } catch (e) {
3128
+ lastError = e instanceof Error ? e : new Error(String(e));
3129
+ }
3130
+ }
3131
+ throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
3132
+ }
3155
3133
  };
3156
3134
  function buildOutputSchema() {
3157
3135
  return [
@@ -3165,6 +3143,29 @@ function buildOutputSchema() {
3165
3143
  "}"
3166
3144
  ].join("\n");
3167
3145
  }
3146
+ function buildRubricOutputSchema() {
3147
+ return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
3148
+ You must return a valid JSON object matching this schema:
3149
+ {
3150
+ "checks": [
3151
+ {
3152
+ "id": "string (rubric id)",
3153
+ "satisfied": boolean,
3154
+ "reasoning": "string (brief explanation)"
3155
+ }
3156
+ ],
3157
+ "overall_reasoning": "string (summary)"
3158
+ }`;
3159
+ }
3160
+ function scoreToVerdict(score) {
3161
+ if (score >= 0.8) {
3162
+ return "pass";
3163
+ }
3164
+ if (score >= 0.6) {
3165
+ return "borderline";
3166
+ }
3167
+ return "fail";
3168
+ }
3168
3169
  function clampScore(value) {
3169
3170
  if (Number.isNaN(value) || !Number.isFinite(value)) {
3170
3171
  return 0;
@@ -3177,71 +3178,15 @@ function clampScore(value) {
3177
3178
  }
3178
3179
  return value;
3179
3180
  }
3180
- function parseQualityResponse(response) {
3181
- const text = typeof response.text === "string" ? response.text.trim() : "";
3182
- if (text.length === 0) {
3183
- return {};
3184
- }
3185
- const direct = attemptParseJson(text);
3186
- if (direct && validateQualityJson(direct)) {
3187
- return direct;
3188
- }
3189
- const extracted = extractJsonBlob(text);
3190
- if (extracted) {
3191
- const parsed = attemptParseJson(extracted);
3192
- if (parsed && validateQualityJson(parsed)) {
3193
- return parsed;
3194
- }
3195
- }
3196
- return {};
3197
- }
3198
- function attemptParseJson(text) {
3199
- try {
3200
- const parsed = JSON.parse(text);
3201
- const score = typeof parsed.score === "number" ? parsed.score : void 0;
3202
- const hits = parsed.hits;
3203
- const misses = parsed.misses;
3204
- const reasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : void 0;
3205
- return { score, hits, misses, reasoning };
3206
- } catch {
3207
- return void 0;
3208
- }
3209
- }
3210
- function validateQualityJson(parsed) {
3211
- if (typeof parsed.score !== "number") {
3212
- return false;
3213
- }
3214
- if (Number.isNaN(parsed.score) || !Number.isFinite(parsed.score)) {
3215
- return false;
3216
- }
3217
- if (parsed.score < 0 || parsed.score > 1) {
3218
- return false;
3219
- }
3220
- if (parsed.hits !== void 0) {
3221
- if (!Array.isArray(parsed.hits)) {
3222
- return false;
3223
- }
3224
- if (!parsed.hits.every((item) => typeof item === "string")) {
3225
- return false;
3226
- }
3227
- }
3228
- if (parsed.misses !== void 0) {
3229
- if (!Array.isArray(parsed.misses)) {
3230
- return false;
3231
- }
3232
- if (!parsed.misses.every((item) => typeof item === "string")) {
3233
- return false;
3234
- }
3235
- }
3236
- if (parsed.reasoning !== void 0 && typeof parsed.reasoning !== "string") {
3237
- return false;
3238
- }
3239
- return true;
3240
- }
3241
3181
  function extractJsonBlob(text) {
3242
3182
  const match = text.match(/\{[\s\S]*\}/);
3243
3183
  return match?.[0];
3244
3184
  }
3185
+ function parseJsonFromText(text) {
3186
+ const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
3187
+ const blob = extractJsonBlob(cleaned) ?? cleaned;
3188
+ return JSON.parse(blob);
3189
+ }
3245
3190
  function isNonEmptyString(value) {
3246
3191
  return typeof value === "string" && value.trim().length > 0;
3247
3192
  }
@@ -3278,6 +3223,7 @@ var CodeEvaluator = class {
3278
3223
  const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
3279
3224
  return {
3280
3225
  score,
3226
+ verdict: scoreToVerdict(score),
3281
3227
  hits,
3282
3228
  misses,
3283
3229
  expectedAspectCount: hits.length + misses.length || 1,
@@ -3291,6 +3237,7 @@ var CodeEvaluator = class {
3291
3237
  const message = error instanceof Error ? error.message : String(error);
3292
3238
  return {
3293
3239
  score: 0,
3240
+ verdict: "fail",
3294
3241
  hits: [],
3295
3242
  misses: [`Code evaluator failed: ${message}`],
3296
3243
  expectedAspectCount: 1,
@@ -3304,6 +3251,33 @@ var CodeEvaluator = class {
3304
3251
  }
3305
3252
  }
3306
3253
  };
3254
+ function calculateRubricScore(result, rubrics) {
3255
+ const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
3256
+ const hits = [];
3257
+ const misses = [];
3258
+ let totalWeight = 0;
3259
+ let earnedWeight = 0;
3260
+ let failedRequired = false;
3261
+ for (const check of result.checks) {
3262
+ const rubric = rubricMap.get(check.id);
3263
+ if (!rubric) {
3264
+ continue;
3265
+ }
3266
+ totalWeight += rubric.weight;
3267
+ if (check.satisfied) {
3268
+ earnedWeight += rubric.weight;
3269
+ hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
3270
+ } else {
3271
+ misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
3272
+ if (rubric.required) {
3273
+ failedRequired = true;
3274
+ }
3275
+ }
3276
+ }
3277
+ const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
3278
+ const verdict = failedRequired ? "fail" : scoreToVerdict(score);
3279
+ return { score, verdict, hits, misses };
3280
+ }
3307
3281
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
3308
3282
  const { spawn: spawn2 } = await import("node:child_process");
3309
3283
  return await new Promise((resolve, reject) => {
@@ -4040,7 +4014,6 @@ async function runEvaluatorList(options) {
4040
4014
  reasoning: score2.reasoning,
4041
4015
  evaluator_provider_request: score2.evaluatorRawRequest
4042
4016
  });
4043
- continue;
4044
4017
  }
4045
4018
  if (evaluator.type === "code") {
4046
4019
  const codeEvaluator = new CodeEvaluator({
@@ -4068,44 +4041,12 @@ async function runEvaluatorList(options) {
4068
4041
  reasoning: score2.reasoning,
4069
4042
  evaluator_provider_request: score2.evaluatorRawRequest
4070
4043
  });
4071
- continue;
4072
- }
4073
- if (evaluator.type === "rubric") {
4074
- const rubricEvaluator = new RubricEvaluator({
4075
- config: evaluator,
4076
- resolveJudgeProvider: async (context) => {
4077
- if (context.judgeProvider) {
4078
- return context.judgeProvider;
4079
- }
4080
- return judgeProvider;
4081
- }
4082
- });
4083
- const score2 = await rubricEvaluator.evaluate({
4084
- evalCase,
4085
- candidate,
4086
- target,
4087
- provider,
4088
- attempt,
4089
- promptInputs,
4090
- now,
4091
- judgeProvider
4092
- });
4093
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4094
- evaluatorResults.push({
4095
- name: evaluator.name,
4096
- type: evaluator.type,
4097
- score: score2.score,
4098
- verdict: score2.verdict,
4099
- hits: score2.hits,
4100
- misses: score2.misses,
4101
- reasoning: score2.reasoning,
4102
- evaluator_provider_request: score2.evaluatorRawRequest
4103
- });
4104
4044
  }
4105
4045
  } catch (error) {
4106
4046
  const message = error instanceof Error ? error.message : String(error);
4107
4047
  const fallbackScore = {
4108
4048
  score: 0,
4049
+ verdict: "fail",
4109
4050
  hits: [],
4110
4051
  misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
4111
4052
  expectedAspectCount: 1,
@@ -4120,6 +4061,7 @@ async function runEvaluatorList(options) {
4120
4061
  name: evaluator.name ?? "unknown",
4121
4062
  type: evaluator.type ?? "unknown",
4122
4063
  score: 0,
4064
+ verdict: "fail",
4123
4065
  hits: [],
4124
4066
  misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
4125
4067
  reasoning: message
@@ -4138,6 +4080,7 @@ async function runEvaluatorList(options) {
4138
4080
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
4139
4081
  const score = {
4140
4082
  score: aggregateScore,
4083
+ verdict: scoreToVerdict2(aggregateScore),
4141
4084
  hits,
4142
4085
  misses,
4143
4086
  expectedAspectCount,
@@ -4188,6 +4131,15 @@ async function resolveCustomPrompt(config) {
4188
4131
  function isNonEmptyString2(value) {
4189
4132
  return typeof value === "string" && value.trim().length > 0;
4190
4133
  }
4134
+ function scoreToVerdict2(score) {
4135
+ if (score >= 0.8) {
4136
+ return "pass";
4137
+ }
4138
+ if (score >= 0.6) {
4139
+ return "borderline";
4140
+ }
4141
+ return "fail";
4142
+ }
4191
4143
  function filterEvalCases(evalCases, evalId) {
4192
4144
  if (!evalId) {
4193
4145
  return evalCases;
@@ -4413,7 +4365,6 @@ function createAgentKernel() {
4413
4365
  export {
4414
4366
  CodeEvaluator,
4415
4367
  LlmJudgeEvaluator,
4416
- RubricEvaluator,
4417
4368
  TEST_MESSAGE_ROLES,
4418
4369
  buildDirectoryChain,
4419
4370
  buildPromptInputs,