@agentv/core 0.20.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -32,6 +32,7 @@ var index_exports = {};
 __export(index_exports, {
   CodeEvaluator: () => CodeEvaluator,
   LlmJudgeEvaluator: () => LlmJudgeEvaluator,
+  RubricEvaluator: () => RubricEvaluator,
   TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
   buildDirectoryChain: () => buildDirectoryChain2,
   buildPromptInputs: () => buildPromptInputs,
@@ -43,6 +44,7 @@ __export(index_exports, {
   extractCodeBlocks: () => extractCodeBlocks,
   fileExists: () => fileExists2,
   findGitRoot: () => findGitRoot,
+  generateRubrics: () => generateRubrics,
   getHitCount: () => getHitCount,
   isEvaluatorKind: () => isEvaluatorKind,
   isGuidelineFile: () => isGuidelineFile,
@@ -106,7 +108,7 @@ function isTestMessage(value) {
   }
   return candidate.content.every(isJsonObject);
 }
-var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
+var EVALUATOR_KIND_VALUES = ["code", "llm_judge", "rubric"];
 var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
 function isEvaluatorKind(value) {
   return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
@@ -508,6 +510,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
      }
    }
    const _model = asString2(rawEvaluator.model);
+   if (typeValue === "rubric") {
+     const rubrics = rawEvaluator.rubrics;
+     if (!Array.isArray(rubrics)) {
+       logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
+       continue;
+     }
+     const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
+       id: asString2(rubric.id) ?? `rubric-${index + 1}`,
+       description: asString2(rubric.description) ?? "",
+       weight: typeof rubric.weight === "number" ? rubric.weight : 1,
+       required: typeof rubric.required === "boolean" ? rubric.required : true
+     })).filter((r) => r.description.length > 0);
+     if (parsedRubrics.length === 0) {
+       logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
+       continue;
+     }
+     evaluators.push({
+       name,
+       type: "rubric",
+       rubrics: parsedRubrics
+     });
+     continue;
+   }
    evaluators.push({
      name,
      type: "llm_judge",
@@ -988,7 +1013,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       continue;
     }
     const conversationId = asString5(evalcase.conversation_id);
-    const outcome = asString5(evalcase.outcome);
+    const outcome = asString5(evalcase.expected_outcome) ?? asString5(evalcase.outcome);
     const inputMessagesValue = evalcase.input_messages;
     const expectedMessagesValue = evalcase.expected_messages;
     if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
@@ -1042,6 +1067,33 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       logError(`Skipping eval case '${id}': ${message}`);
       continue;
     }
+    const inlineRubrics = evalcase.rubrics;
+    if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
+      const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
+        if (typeof rubric === "string") {
+          return {
+            id: `rubric-${index + 1}`,
+            description: rubric,
+            weight: 1,
+            required: true
+          };
+        }
+        return {
+          id: asString5(rubric.id) ?? `rubric-${index + 1}`,
+          description: asString5(rubric.description) ?? "",
+          weight: typeof rubric.weight === "number" ? rubric.weight : 1,
+          required: typeof rubric.required === "boolean" ? rubric.required : true
+        };
+      }).filter((r) => r.description.length > 0);
+      if (rubricItems.length > 0) {
+        const rubricEvaluator = {
+          name: "rubric",
+          type: "rubric",
+          rubrics: rubricItems
+        };
+        evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
+      }
+    }
     const userFilePaths = [];
     for (const segment of inputSegments) {
       if (segment.type === "file" && typeof segment.resolvedPath === "string") {
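Inline `rubrics` on an eval case accept two shapes: a bare string (shorthand for a required, weight-1 item) or the full object form. Valid items are wrapped in a synthetic evaluator named `rubric` and prepended to the case's evaluator list. A sketch of the normalization (the eval-case fragment is hypothetical):

```js
// Hypothetical eval case mixing both accepted shapes.
const evalcase = {
  rubrics: [
    "Answer identifies the root cause",                      // string shorthand
    { id: "fix", description: "Proposes a fix", weight: 2 }  // object form
  ]
};
// Resulting synthetic evaluator, prepended to any existing evaluators:
// { name: "rubric", type: "rubric", rubrics: [
//   { id: "rubric-1", description: "Answer identifies the root cause", weight: 1, required: true },
//   { id: "fix",      description: "Proposes a fix",                   weight: 2, required: true }
// ] }
```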
@@ -1251,6 +1303,9 @@ var AzureProvider = class {
       retryConfig: this.retryConfig
     });
   }
+  asLanguageModel() {
+    return this.model;
+  }
 };
 var AnthropicProvider = class {
   constructor(targetName, config) {
@@ -1284,6 +1339,9 @@ var AnthropicProvider = class {
       providerOptions
     });
   }
+  asLanguageModel() {
+    return this.model;
+  }
 };
 var GeminiProvider = class {
   constructor(targetName, config) {
@@ -1314,6 +1372,9 @@ var GeminiProvider = class {
       retryConfig: this.retryConfig
     });
   }
+  asLanguageModel() {
+    return this.model;
+  }
 };
 function buildAzureOptions(config) {
   const options = {
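Each provider class now exposes its underlying AI SDK model via `asLanguageModel()`, which is what lets the new rubric evaluator and generator call `generateText` directly instead of going through the provider's chat interface. A minimal consumer sketch (the helper name is hypothetical; the optional call mirrors how the evaluator probes for the method below):

```js
const { generateText } = require("ai");

// Hypothetical helper: works with any provider that exposes asLanguageModel().
async function completeWith(provider, prompt) {
  const model = provider.asLanguageModel?.(); // undefined on older providers
  if (!model) {
    throw new Error("Provider does not support language model interface");
  }
  const { text } = await generateText({ model, prompt });
  return text;
}
```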
@@ -3189,38 +3250,38 @@ var import_node_path11 = __toESM(require("path"), 1);
 var import_subagent = require("subagent");
 
 // src/evaluation/providers/vscode-templates.ts
-var AGENTV_REQUEST_TEMPLATE = `[[ ## system_instructions ## ]]
+var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
 
-**IMPORTANT**: You are processing an evaluation request from AgentV. Follow these exact steps:
+{{userQuery}}
+
+[[ ## system_instructions ## ]]
 
-1. Read and understand all guideline files and attachments provided above
-2. Process the user query below according to the guidelines
-3. Write your complete response to: {{responseFileTmp}}
-4. When completely finished, run these PowerShell commands to signal completion:
-\`\`\`powershell
+**IMPORTANT**: Follow these exact steps:
+1. Create and write your complete response to: {{responseFileTmp}}
+   - Do NOT create any additional output files in the workspace.
+   - All intended file outputs/changes MUST be written in your response file.
+   - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
+2. When completely finished, run these PowerShell commands to signal completion:
+\`\`\`
 Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
-if (Test-Path subagent.lock) { Remove-Item subagent.lock }
+if (Test-Path subagent.lock) { del subagent.lock }
 \`\`\`
 
-Do not proceed to step 4 until your response is completely written to the temporary file.
-
-[[ ## task ## ]]
-
-{{userQuery}}
+Do not proceed to step 2 until your response is completely written to the temporary file.
 `;
-var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## system_instructions ## ]]
-
-**IMPORTANT**: You are processing a batch evaluation request from AgentV. Follow these exact steps:
+var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
 
-1. Read and understand all guideline files and attachments provided above
-2. Process the user query below according to the guidelines
-3. Write your complete response to: {{responseFileTmp}}
-4. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
-5. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
+{{userQuery}}
 
-[[ ## task ## ]]
+[[ ## system_instructions ## ]]
 
-{{userQuery}}
+**IMPORTANT**: Follow these exact steps:
+1. Create and write your complete response to: {{responseFileTmp}}
+   - Do NOT create any additional output files in the workspace.
+   - All intended file outputs/changes MUST be written in your response file.
+   - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
+2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
+3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
 `;
 
 // src/evaluation/providers/vscode.ts
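The reworked templates move the task above the instructions and encode a file-based completion handshake: the agent writes to a temporary path, renames it to the final path once stable, and (in the single-request flow) deletes `subagent.lock`. A rough sketch of the orchestrator side of that handshake, purely illustrative (`waitForResponse` is hypothetical; the real logic lives in src/evaluation/providers/vscode.ts, which this diff does not show):

```js
const fs = require("node:fs/promises");

// Hypothetical poller: the rename from {{responseFileTmp}} to
// {{responseFileFinal}} is the "response is stable" signal.
async function waitForResponse(responseFileFinal, timeoutMs = 300000) {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    try {
      return await fs.readFile(responseFileFinal, "utf8"); // rename happened
    } catch {
      await new Promise((resolve) => setTimeout(resolve, 1000)); // keep waiting
    }
  }
  throw new Error(`Timed out waiting for ${responseFileFinal}`);
}
```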
@@ -3560,6 +3621,148 @@ function resolveAndCreateProvider(definition, env = process.env) {
   return createProvider(resolved);
 }
 
+// src/evaluation/evaluators/rubric-evaluator.ts
+var import_ai2 = require("ai");
+var import_zod2 = require("zod");
+var rubricCheckResultSchema = import_zod2.z.object({
+  id: import_zod2.z.string().describe("The ID of the rubric item being checked"),
+  satisfied: import_zod2.z.boolean().describe("Whether this rubric requirement is met"),
+  reasoning: import_zod2.z.string().describe("Brief explanation (1-2 sentences) for this check")
+});
+var rubricEvaluationSchema = import_zod2.z.object({
+  checks: import_zod2.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
+  overall_reasoning: import_zod2.z.string().describe("Overall assessment summary (1-2 sentences)")
+});
+var RubricEvaluator = class {
+  kind = "rubric";
+  config;
+  resolveJudgeProvider;
+  constructor(options) {
+    this.config = options.config;
+    this.resolveJudgeProvider = options.resolveJudgeProvider;
+  }
+  async evaluate(context) {
+    const judgeProvider = await this.resolveJudgeProvider(context);
+    if (!judgeProvider) {
+      throw new Error("No judge provider available for rubric evaluation");
+    }
+    if (!this.config.rubrics || this.config.rubrics.length === 0) {
+      throw new Error(
+        `No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
+      );
+    }
+    const prompt = this.buildPrompt(context, this.config.rubrics);
+    const model = judgeProvider.asLanguageModel?.();
+    if (!model) {
+      throw new Error("Judge provider does not support language model interface");
+    }
+    const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
+You must return a valid JSON object matching this schema:
+{
+  "checks": [
+    {
+      "id": "string (rubric id)",
+      "satisfied": boolean,
+      "reasoning": "string (brief explanation)"
+    }
+  ],
+  "overall_reasoning": "string (summary)"
+}`;
+    let result;
+    let lastError;
+    for (let attempt = 1; attempt <= 3; attempt++) {
+      try {
+        const { text } = await (0, import_ai2.generateText)({
+          model,
+          system,
+          prompt
+        });
+        const cleaned = text.replace(/```json\n?|```/g, "").trim();
+        result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
+        break;
+      } catch (e) {
+        lastError = e instanceof Error ? e : new Error(String(e));
+      }
+    }
+    if (!result) {
+      throw new Error(
+        `Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
+      );
+    }
+    const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
+    return {
+      score,
+      verdict,
+      hits,
+      misses,
+      expectedAspectCount: this.config.rubrics.length,
+      reasoning: result.overall_reasoning,
+      evaluatorRawRequest: {
+        prompt
+      }
+    };
+  }
+  buildPrompt(context, rubrics) {
+    const parts = [
+      "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
+      "",
+      "[[ ## question ## ]]",
+      context.evalCase.question,
+      "",
+      "[[ ## expected_outcome ## ]]",
+      context.evalCase.expected_outcome,
+      ""
+    ];
+    if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
+      parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
+    }
+    parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
+    for (const rubric of rubrics) {
+      const requiredLabel = rubric.required ? " (REQUIRED)" : "";
+      const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
+      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
+    }
+    parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
+    return parts.join("\n");
+  }
+  calculateScore(result, rubrics) {
+    const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
+    const hits = [];
+    const misses = [];
+    let totalWeight = 0;
+    let earnedWeight = 0;
+    let failedRequired = false;
+    for (const check of result.checks) {
+      const rubric = rubricMap.get(check.id);
+      if (!rubric) {
+        continue;
+      }
+      totalWeight += rubric.weight;
+      if (check.satisfied) {
+        earnedWeight += rubric.weight;
+        hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      } else {
+        misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+        if (rubric.required) {
+          failedRequired = true;
+        }
+      }
+    }
+    const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
+    let verdict;
+    if (failedRequired) {
+      verdict = "fail";
+    } else if (score >= 0.8) {
+      verdict = "pass";
+    } else if (score >= 0.6) {
+      verdict = "borderline";
+    } else {
+      verdict = "fail";
+    }
+    return { score, verdict, hits, misses };
+  }
+};
+
 // src/evaluation/evaluators.ts
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
 
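`calculateScore` above is a weighted average with a hard gate: score = earnedWeight / totalWeight, clamped to [0, 1]; a missed required rubric forces a `fail` verdict regardless of score, otherwise a score of at least 0.8 is `pass`, at least 0.6 is `borderline`, and anything lower is `fail`. A worked example with invented rubric values:

```js
// Weights 2 + 1 + 1 = 4. The judge satisfies the first two checks, so
// earnedWeight = 3 and score = 3 / 4 = 0.75. The missed rubric is optional,
// so no hard fail applies and 0.6 <= 0.75 < 0.8 yields "borderline".
// Had "cites-source" been required: true, the verdict would be "fail".
const rubrics = [
  { id: "root-cause", description: "Identifies the root cause", weight: 2, required: true },
  { id: "fix", description: "Proposes a fix", weight: 1, required: true },
  { id: "cites-source", description: "Cites a source", weight: 1, required: false }
];
const checks = [
  { id: "root-cause", satisfied: true, reasoning: "Names the off-by-one error." },
  { id: "fix", satisfied: true, reasoning: "Shows the corrected loop bound." },
  { id: "cites-source", satisfied: false, reasoning: "No source is cited." }
];
```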
@@ -4534,6 +4737,7 @@ async function runEvaluatorList(options) {
        name: evaluator.name,
        type: evaluator.type,
        score: score2.score,
+       verdict: score2.verdict,
        hits: score2.hits,
        misses: score2.misses,
        reasoning: score2.reasoning,
@@ -4561,6 +4765,40 @@ async function runEvaluatorList(options) {
        name: evaluator.name,
        type: evaluator.type,
        score: score2.score,
+       verdict: score2.verdict,
+       hits: score2.hits,
+       misses: score2.misses,
+       reasoning: score2.reasoning,
+       evaluator_provider_request: score2.evaluatorRawRequest
+     });
+     continue;
+   }
+   if (evaluator.type === "rubric") {
+     const rubricEvaluator = new RubricEvaluator({
+       config: evaluator,
+       resolveJudgeProvider: async (context) => {
+         if (context.judgeProvider) {
+           return context.judgeProvider;
+         }
+         return judgeProvider;
+       }
+     });
+     const score2 = await rubricEvaluator.evaluate({
+       evalCase,
+       candidate,
+       target,
+       provider,
+       attempt,
+       promptInputs,
+       now,
+       judgeProvider
+     });
+     scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+     evaluatorResults.push({
+       name: evaluator.name,
+       type: evaluator.type,
+       score: score2.score,
+       verdict: score2.verdict,
        hits: score2.hits,
        misses: score2.misses,
        reasoning: score2.reasoning,
@@ -4791,6 +5029,86 @@ function isTimeoutLike(error) {
   return value.includes("timeout");
 }
 
+// src/evaluation/generators/rubric-generator.ts
+var import_ai3 = require("ai");
+var import_zod3 = require("zod");
+var rubricItemSchema = import_zod3.z.object({
+  id: import_zod3.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
+  description: import_zod3.z.string().describe("What this rubric checks for"),
+  weight: import_zod3.z.number().default(1).describe("Relative importance (default 1.0)"),
+  required: import_zod3.z.boolean().default(true).describe("Whether this is a mandatory requirement")
+});
+var rubricGenerationSchema = import_zod3.z.object({
+  rubrics: import_zod3.z.array(rubricItemSchema).describe("List of evaluation rubrics")
+});
+async function generateRubrics(options) {
+  const { expectedOutcome, question, referenceAnswer, provider } = options;
+  const prompt = buildPrompt(expectedOutcome, question, referenceAnswer);
+  const model = provider.asLanguageModel?.();
+  if (!model) {
+    throw new Error("Provider does not support language model interface");
+  }
+  const system = `You are an expert at creating evaluation rubrics.
+You must return a valid JSON object matching this schema:
+{
+  "rubrics": [
+    {
+      "id": "string (short identifier)",
+      "description": "string (what to check)",
+      "weight": number (default 1.0),
+      "required": boolean (default true)
+    }
+  ]
+}`;
+  let result;
+  let lastError;
+  for (let attempt = 1; attempt <= 3; attempt++) {
+    try {
+      const { text } = await (0, import_ai3.generateText)({
+        model,
+        system,
+        prompt
+      });
+      const cleaned = text.replace(/```json\n?|```/g, "").trim();
+      result = rubricGenerationSchema.parse(JSON.parse(cleaned));
+      break;
+    } catch (e) {
+      lastError = e instanceof Error ? e : new Error(String(e));
+    }
+  }
+  if (!result) {
+    throw new Error(`Failed to parse generated rubrics after 3 attempts: ${lastError?.message}`);
+  }
+  return result.rubrics;
+}
+function buildPrompt(expectedOutcome, question, referenceAnswer) {
+  const parts = [
+    "You are an expert at creating evaluation rubrics.",
+    "Given the expected outcome (and optionally the question and reference answer),",
+    "generate a list of specific, measurable rubric items to evaluate whether an answer meets the expected outcome.",
+    "",
+    "Each rubric should:",
+    "- Be specific and testable",
+    "- Have a short, descriptive ID",
+    "- Include a clear description of what to check",
+    "- Indicate if it is required (mandatory) or optional",
+    "- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
+    "",
+    "Generate 3-7 rubric items that comprehensively cover the expected outcome.",
+    "",
+    "[[ ## expected_outcome ## ]]",
+    expectedOutcome,
+    ""
+  ];
+  if (question && question.trim().length > 0) {
+    parts.push("[[ ## question ## ]]", question, "");
+  }
+  if (referenceAnswer && referenceAnswer.trim().length > 0) {
+    parts.push("[[ ## reference_answer ## ]]", referenceAnswer, "");
+  }
+  return parts.join("\n");
+}
+
 // src/index.ts
 function createAgentKernel() {
   return { status: "stub" };
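`generateRubrics` is also exported from the package (see the export hunks below), so rubrics can be drafted programmatically and pasted into an eval file. A usage sketch (the provider is assumed to be any configured AgentV provider exposing `asLanguageModel()`; option names come from the destructuring above):

```js
const { generateRubrics } = require("@agentv/core");

// Illustrative call; `provider` must expose asLanguageModel().
async function draftRubrics(provider) {
  const rubrics = await generateRubrics({
    expectedOutcome: "The answer explains the bug and proposes a tested fix.",
    question: "Why does the pagination loop skip the last page?", // optional
    provider
  });
  // => [{ id, description, weight, required }, ...] ready for a rubric
  //    evaluator config or an eval case's inline `rubrics` field.
  return rubrics;
}
```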
@@ -4799,6 +5117,7 @@ function createAgentKernel() {
 0 && (module.exports = {
   CodeEvaluator,
   LlmJudgeEvaluator,
+  RubricEvaluator,
   TEST_MESSAGE_ROLES,
   buildDirectoryChain,
   buildPromptInputs,
@@ -4810,6 +5129,7 @@ function createAgentKernel() {
   extractCodeBlocks,
   fileExists,
   findGitRoot,
+  generateRubrics,
   getHitCount,
   isEvaluatorKind,
   isGuidelineFile,