@agentv/core 0.21.0 → 0.22.0
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- package/dist/{chunk-SVY324GN.js → chunk-BO7KG7JX.js} +1 -1
- package/dist/{chunk-SVY324GN.js.map → chunk-BO7KG7JX.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs +4 -4
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +5 -5
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +322 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +49 -3
- package/dist/index.d.ts +49 -3
- package/dist/index.js +321 -3
- package/dist/index.js.map +1 -1
- package/package.json +2 -5
package/dist/index.cjs (CHANGED)
@@ -32,6 +32,7 @@ var index_exports = {};
 __export(index_exports, {
   CodeEvaluator: () => CodeEvaluator,
   LlmJudgeEvaluator: () => LlmJudgeEvaluator,
+  RubricEvaluator: () => RubricEvaluator,
   TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
   buildDirectoryChain: () => buildDirectoryChain2,
   buildPromptInputs: () => buildPromptInputs,
@@ -43,6 +44,7 @@ __export(index_exports, {
   extractCodeBlocks: () => extractCodeBlocks,
   fileExists: () => fileExists2,
   findGitRoot: () => findGitRoot,
+  generateRubrics: () => generateRubrics,
   getHitCount: () => getHitCount,
   isEvaluatorKind: () => isEvaluatorKind,
   isGuidelineFile: () => isGuidelineFile,
@@ -106,7 +108,7 @@ function isTestMessage(value) {
   }
   return candidate.content.every(isJsonObject);
 }
-var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
+var EVALUATOR_KIND_VALUES = ["code", "llm_judge", "rubric"];
 var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
 function isEvaluatorKind(value) {
   return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
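
Note: this one-line change is the heart of the release — "rubric" becomes a third evaluator kind alongside "code" and "llm_judge". The published typings (index.d.ts / index.d.cts, +49 -3 in the summary above) presumably widen the corresponding union; the declaration below is an assumed sketch, with only the runtime values confirmed by this diff:

```ts
// Assumed shape of the widened union; the diff confirms only the runtime values.
type EvaluatorKind = "code" | "llm_judge" | "rubric";

// The runtime guard from the hunk above narrows unknown input to this union.
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
```
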
@@ -508,6 +510,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       }
     }
     const _model = asString2(rawEvaluator.model);
+    if (typeValue === "rubric") {
+      const rubrics = rawEvaluator.rubrics;
+      if (!Array.isArray(rubrics)) {
+        logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
+        continue;
+      }
+      const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
+        id: asString2(rubric.id) ?? `rubric-${index + 1}`,
+        description: asString2(rubric.description) ?? "",
+        weight: typeof rubric.weight === "number" ? rubric.weight : 1,
+        required: typeof rubric.required === "boolean" ? rubric.required : true
+      })).filter((r) => r.description.length > 0);
+      if (parsedRubrics.length === 0) {
+        logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
+        continue;
+      }
+      evaluators.push({
+        name,
+        type: "rubric",
+        rubrics: parsedRubrics
+      });
+      continue;
+    }
     evaluators.push({
       name,
       type: "llm_judge",
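
Note: per this parsing logic, a rubric evaluator entry needs a `rubrics` array whose items carry a non-empty `description`; `id` defaults to `rubric-<n>`, `weight` to 1, and `required` to true, and the evaluator is skipped with a warning if no valid items remain. A minimal sketch of an accepted entry, written as a TypeScript literal (the on-disk eval file format is not shown in this diff):

```ts
// Hypothetical entry; field names match what parseEvaluators reads above.
const rubricEvaluatorEntry = {
  name: "answer-quality", // assumed example name
  type: "rubric",
  rubrics: [
    { id: "cites-source", description: "Cites at least one source", weight: 2, required: true },
    { description: "Stays under 200 words" } // id -> "rubric-2", weight -> 1, required -> true
  ]
};
```
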
@@ -988,7 +1013,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       continue;
     }
     const conversationId = asString5(evalcase.conversation_id);
-    const outcome = asString5(evalcase.outcome);
+    const outcome = asString5(evalcase.expected_outcome) ?? asString5(evalcase.outcome);
     const inputMessagesValue = evalcase.input_messages;
     const expectedMessagesValue = evalcase.expected_messages;
     if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
@@ -1042,6 +1067,33 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       logError(`Skipping eval case '${id}': ${message}`);
       continue;
     }
+    const inlineRubrics = evalcase.rubrics;
+    if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
+      const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
+        if (typeof rubric === "string") {
+          return {
+            id: `rubric-${index + 1}`,
+            description: rubric,
+            weight: 1,
+            required: true
+          };
+        }
+        return {
+          id: asString5(rubric.id) ?? `rubric-${index + 1}`,
+          description: asString5(rubric.description) ?? "",
+          weight: typeof rubric.weight === "number" ? rubric.weight : 1,
+          required: typeof rubric.required === "boolean" ? rubric.required : true
+        };
+      }).filter((r) => r.description.length > 0);
+      if (rubricItems.length > 0) {
+        const rubricEvaluator = {
+          name: "rubric",
+          type: "rubric",
+          rubrics: rubricItems
+        };
+        evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
+      }
+    }
     const userFilePaths = [];
     for (const segment of inputSegments) {
       if (segment.type === "file" && typeof segment.resolvedPath === "string") {
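
Note: combined with the `expected_outcome` fallback in the previous hunk, an eval case can now declare rubrics inline, as bare strings or as objects; when any survive validation, a synthesized evaluator named "rubric" is prepended to the case's evaluator list. A hedged sketch of such a case (TypeScript literal; the serialized file format is not part of this diff):

```ts
// Hypothetical eval case; keys mirror what loadEvalCases reads above.
const evalCase = {
  id: "case-1", // assumed example id
  expected_outcome: "Explains the tradeoff clearly", // now preferred over legacy `outcome`
  input_messages: [{ role: "user", content: "Should we cache at the edge?" }],
  rubrics: [
    "Mentions both alternatives", // string form: becomes { id: "rubric-1", weight: 1, required: true }
    { id: "tone", description: "Keeps a neutral tone", required: false }
  ]
};
```
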
@@ -1251,6 +1303,9 @@ var AzureProvider = class {
       retryConfig: this.retryConfig
     });
   }
+  asLanguageModel() {
+    return this.model;
+  }
 };
 var AnthropicProvider = class {
   constructor(targetName, config) {
@@ -1284,6 +1339,9 @@ var AnthropicProvider = class {
       providerOptions
     });
   }
+  asLanguageModel() {
+    return this.model;
+  }
 };
 var GeminiProvider = class {
   constructor(targetName, config) {
@@ -1314,6 +1372,9 @@ var GeminiProvider = class {
       retryConfig: this.retryConfig
     });
   }
+  asLanguageModel() {
+    return this.model;
+  }
 };
 function buildAzureOptions(config) {
   const options = {
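
Note: all three providers gain the same `asLanguageModel()` accessor returning the underlying AI SDK model. Callers in this release treat the method as optional, so provider implementations without it keep working; a minimal consumer sketch mirroring the evaluator code below:

```ts
// Sketch of how the rubric evaluator and generator below obtain a model.
function requireModel(provider: { asLanguageModel?: () => unknown }): unknown {
  const model = provider.asLanguageModel?.(); // method may be absent on other providers
  if (!model) {
    throw new Error("Judge provider does not support language model interface");
  }
  return model;
}
```
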
@@ -3560,6 +3621,148 @@ function resolveAndCreateProvider(definition, env = process.env) {
   return createProvider(resolved);
 }
 
+// src/evaluation/evaluators/rubric-evaluator.ts
+var import_ai2 = require("ai");
+var import_zod2 = require("zod");
+var rubricCheckResultSchema = import_zod2.z.object({
+  id: import_zod2.z.string().describe("The ID of the rubric item being checked"),
+  satisfied: import_zod2.z.boolean().describe("Whether this rubric requirement is met"),
+  reasoning: import_zod2.z.string().describe("Brief explanation (1-2 sentences) for this check")
+});
+var rubricEvaluationSchema = import_zod2.z.object({
+  checks: import_zod2.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
+  overall_reasoning: import_zod2.z.string().describe("Overall assessment summary (1-2 sentences)")
+});
+var RubricEvaluator = class {
+  kind = "rubric";
+  config;
+  resolveJudgeProvider;
+  constructor(options) {
+    this.config = options.config;
+    this.resolveJudgeProvider = options.resolveJudgeProvider;
+  }
+  async evaluate(context) {
+    const judgeProvider = await this.resolveJudgeProvider(context);
+    if (!judgeProvider) {
+      throw new Error("No judge provider available for rubric evaluation");
+    }
+    if (!this.config.rubrics || this.config.rubrics.length === 0) {
+      throw new Error(
+        `No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
+      );
+    }
+    const prompt = this.buildPrompt(context, this.config.rubrics);
+    const model = judgeProvider.asLanguageModel?.();
+    if (!model) {
+      throw new Error("Judge provider does not support language model interface");
+    }
+    const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
+You must return a valid JSON object matching this schema:
+{
+  "checks": [
+    {
+      "id": "string (rubric id)",
+      "satisfied": boolean,
+      "reasoning": "string (brief explanation)"
+    }
+  ],
+  "overall_reasoning": "string (summary)"
+}`;
+    let result;
+    let lastError;
+    for (let attempt = 1; attempt <= 3; attempt++) {
+      try {
+        const { text } = await (0, import_ai2.generateText)({
+          model,
+          system,
+          prompt
+        });
+        const cleaned = text.replace(/```json\n?|```/g, "").trim();
+        result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
+        break;
+      } catch (e) {
+        lastError = e instanceof Error ? e : new Error(String(e));
+      }
+    }
+    if (!result) {
+      throw new Error(
+        `Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
+      );
+    }
+    const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
+    return {
+      score,
+      verdict,
+      hits,
+      misses,
+      expectedAspectCount: this.config.rubrics.length,
+      reasoning: result.overall_reasoning,
+      evaluatorRawRequest: {
+        prompt
+      }
+    };
+  }
+  buildPrompt(context, rubrics) {
+    const parts = [
+      "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
+      "",
+      "[[ ## question ## ]]",
+      context.evalCase.question,
+      "",
+      "[[ ## expected_outcome ## ]]",
+      context.evalCase.expected_outcome,
+      ""
+    ];
+    if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
+      parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
+    }
+    parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
+    for (const rubric of rubrics) {
+      const requiredLabel = rubric.required ? " (REQUIRED)" : "";
+      const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
+      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
+    }
+    parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
+    return parts.join("\n");
+  }
+  calculateScore(result, rubrics) {
+    const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
+    const hits = [];
+    const misses = [];
+    let totalWeight = 0;
+    let earnedWeight = 0;
+    let failedRequired = false;
+    for (const check of result.checks) {
+      const rubric = rubricMap.get(check.id);
+      if (!rubric) {
+        continue;
+      }
+      totalWeight += rubric.weight;
+      if (check.satisfied) {
+        earnedWeight += rubric.weight;
+        hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      } else {
+        misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+        if (rubric.required) {
+          failedRequired = true;
+        }
+      }
+    }
+    const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
+    let verdict;
+    if (failedRequired) {
+      verdict = "fail";
+    } else if (score >= 0.8) {
+      verdict = "pass";
+    } else if (score >= 0.6) {
+      verdict = "borderline";
+    } else {
+      verdict = "fail";
+    }
+    return { score, verdict, hits, misses };
+  }
+};
+
 // src/evaluation/evaluators.ts
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
 
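
Note: `calculateScore` is a weighted average with a hard gate — a missed required rubric forces verdict "fail" regardless of score; otherwise score >= 0.8 passes and >= 0.6 is borderline, and checks whose `id` matches no configured rubric are ignored. A worked example under assumed inputs:

```ts
// Hypothetical rubrics and judge output.
// Rubrics: a (weight 2, required), b (weight 1, optional), c (weight 1, optional).
// Judge marks a and b satisfied, c unsatisfied.
//   totalWeight  = 2 + 1 + 1 = 4
//   earnedWeight = 2 + 1     = 3
//   score        = 3 / 4     = 0.75
// c is optional, so failedRequired stays false -> verdict "borderline" (0.6 <= 0.75 < 0.8).
// Had the miss been `a` (required), the verdict would be "fail" despite score 0.5.
```
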
@@ -4534,6 +4737,7 @@ async function runEvaluatorList(options) {
         name: evaluator.name,
         type: evaluator.type,
         score: score2.score,
+        verdict: score2.verdict,
         hits: score2.hits,
         misses: score2.misses,
         reasoning: score2.reasoning,
@@ -4561,6 +4765,40 @@ async function runEvaluatorList(options) {
         name: evaluator.name,
         type: evaluator.type,
         score: score2.score,
+        verdict: score2.verdict,
+        hits: score2.hits,
+        misses: score2.misses,
+        reasoning: score2.reasoning,
+        evaluator_provider_request: score2.evaluatorRawRequest
+      });
+      continue;
+    }
+    if (evaluator.type === "rubric") {
+      const rubricEvaluator = new RubricEvaluator({
+        config: evaluator,
+        resolveJudgeProvider: async (context) => {
+          if (context.judgeProvider) {
+            return context.judgeProvider;
+          }
+          return judgeProvider;
+        }
+      });
+      const score2 = await rubricEvaluator.evaluate({
+        evalCase,
+        candidate,
+        target,
+        provider,
+        attempt,
+        promptInputs,
+        now,
+        judgeProvider
+      });
+      scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+      evaluatorResults.push({
+        name: evaluator.name,
+        type: evaluator.type,
+        score: score2.score,
+        verdict: score2.verdict,
         hits: score2.hits,
         misses: score2.misses,
         reasoning: score2.reasoning,
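
Note the judge resolution order in this branch: a judge attached to the evaluation context wins, and the run-level judge passed to runEvaluatorList is the fallback; RubricEvaluator.evaluate() throws if neither exists. Effectively:

```ts
// Equivalent to the closure above (the context type is an assumption).
declare const runLevelJudgeProvider: unknown; // assumed: the `judgeProvider` in scope above
const resolveJudgeProvider = async (context: { judgeProvider?: unknown }) =>
  context.judgeProvider ?? runLevelJudgeProvider; // fall back to the run-level judge
```
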
@@ -4791,6 +5029,86 @@ function isTimeoutLike(error) {
   return value.includes("timeout");
 }
 
+// src/evaluation/generators/rubric-generator.ts
+var import_ai3 = require("ai");
+var import_zod3 = require("zod");
+var rubricItemSchema = import_zod3.z.object({
+  id: import_zod3.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
+  description: import_zod3.z.string().describe("What this rubric checks for"),
+  weight: import_zod3.z.number().default(1).describe("Relative importance (default 1.0)"),
+  required: import_zod3.z.boolean().default(true).describe("Whether this is a mandatory requirement")
+});
+var rubricGenerationSchema = import_zod3.z.object({
+  rubrics: import_zod3.z.array(rubricItemSchema).describe("List of evaluation rubrics")
+});
+async function generateRubrics(options) {
+  const { expectedOutcome, question, referenceAnswer, provider } = options;
+  const prompt = buildPrompt(expectedOutcome, question, referenceAnswer);
+  const model = provider.asLanguageModel?.();
+  if (!model) {
+    throw new Error("Provider does not support language model interface");
+  }
+  const system = `You are an expert at creating evaluation rubrics.
+You must return a valid JSON object matching this schema:
+{
+  "rubrics": [
+    {
+      "id": "string (short identifier)",
+      "description": "string (what to check)",
+      "weight": number (default 1.0),
+      "required": boolean (default true)
+    }
+  ]
+}`;
+  let result;
+  let lastError;
+  for (let attempt = 1; attempt <= 3; attempt++) {
+    try {
+      const { text } = await (0, import_ai3.generateText)({
+        model,
+        system,
+        prompt
+      });
+      const cleaned = text.replace(/```json\n?|```/g, "").trim();
+      result = rubricGenerationSchema.parse(JSON.parse(cleaned));
+      break;
+    } catch (e) {
+      lastError = e instanceof Error ? e : new Error(String(e));
+    }
+  }
+  if (!result) {
+    throw new Error(`Failed to parse generated rubrics after 3 attempts: ${lastError?.message}`);
+  }
+  return result.rubrics;
+}
+function buildPrompt(expectedOutcome, question, referenceAnswer) {
+  const parts = [
+    "You are an expert at creating evaluation rubrics.",
+    "Given the expected outcome (and optionally the question and reference answer),",
+    "generate a list of specific, measurable rubric items to evaluate whether an answer meets the expected outcome.",
+    "",
+    "Each rubric should:",
+    "- Be specific and testable",
+    "- Have a short, descriptive ID",
+    "- Include a clear description of what to check",
+    "- Indicate if it is required (mandatory) or optional",
+    "- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
+    "",
+    "Generate 3-7 rubric items that comprehensively cover the expected outcome.",
+    "",
+    "[[ ## expected_outcome ## ]]",
+    expectedOutcome,
+    ""
+  ];
+  if (question && question.trim().length > 0) {
+    parts.push("[[ ## question ## ]]", question, "");
+  }
+  if (referenceAnswer && referenceAnswer.trim().length > 0) {
+    parts.push("[[ ## reference_answer ## ]]", referenceAnswer, "");
+  }
+  return parts.join("\n");
+}
+
 // src/index.ts
 function createAgentKernel() {
   return { status: "stub" };
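
Note: pairing the generator with the evaluator, the intended flow appears to be: derive rubrics once from a case's expected outcome (the RubricEvaluator error message points at an "agentv generate rubrics" command), store them on a rubric evaluator, then grade candidates with RubricEvaluator. A hedged usage sketch for the newly exported generateRubrics:

```ts
import { generateRubrics } from "@agentv/core";

// Assumed: any provider instance exposing asLanguageModel(), e.g. the ones above.
declare const judgeProvider: { asLanguageModel?: () => unknown };

const rubrics = await generateRubrics({
  expectedOutcome: "Explains the caching tradeoff clearly", // required by the destructuring above
  question: "Should we cache at the edge?",                 // optional context
  referenceAnswer: undefined,                               // optional context
  provider: judgeProvider,
});
// -> [{ id, description, weight, required }, ...] (3-7 items per the generation prompt)
```
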
@@ -4799,6 +5117,7 @@ function createAgentKernel() {
 0 && (module.exports = {
   CodeEvaluator,
   LlmJudgeEvaluator,
+  RubricEvaluator,
   TEST_MESSAGE_ROLES,
   buildDirectoryChain,
   buildPromptInputs,
@@ -4810,6 +5129,7 @@ function createAgentKernel() {
   extractCodeBlocks,
   fileExists,
   findGitRoot,
+  generateRubrics,
   getHitCount,
   isEvaluatorKind,
   isGuidelineFile,