@agentv/core 0.20.0 → 0.22.0

package/dist/index.js CHANGED
@@ -8,7 +8,7 @@ import {
  readTextFile,
  resolveFileReference,
  resolveTargetDefinition
- } from "./chunk-SVY324GN.js";
+ } from "./chunk-BO7KG7JX.js";

  // src/evaluation/types.ts
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -51,7 +51,7 @@ function isTestMessage(value) {
  }
  return candidate.content.every(isJsonObject);
  }
- var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
+ var EVALUATOR_KIND_VALUES = ["code", "llm_judge", "rubric"];
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
  function isEvaluatorKind(value) {
  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
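With "rubric" joining EVALUATOR_KIND_VALUES, the unchanged isEvaluatorKind guard now accepts the new kind. A quick illustration (inputs invented):

```ts
import { isEvaluatorKind } from "@agentv/core";

isEvaluatorKind("rubric");    // now true
isEvaluatorKind("llm_judge"); // still true
isEvaluatorKind("vibes");     // false, not in EVALUATOR_KIND_SET
```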
@@ -453,6 +453,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  }
  }
  const _model = asString2(rawEvaluator.model);
+ if (typeValue === "rubric") {
+ const rubrics = rawEvaluator.rubrics;
+ if (!Array.isArray(rubrics)) {
+ logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
+ continue;
+ }
+ const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
+ id: asString2(rubric.id) ?? `rubric-${index + 1}`,
+ description: asString2(rubric.description) ?? "",
+ weight: typeof rubric.weight === "number" ? rubric.weight : 1,
+ required: typeof rubric.required === "boolean" ? rubric.required : true
+ })).filter((r) => r.description.length > 0);
+ if (parsedRubrics.length === 0) {
+ logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
+ continue;
+ }
+ evaluators.push({
+ name,
+ type: "rubric",
+ rubrics: parsedRubrics
+ });
+ continue;
+ }
  evaluators.push({
  name,
  type: "llm_judge",
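Judging from the parsing branch above, a rubric evaluator entry presumably looks like the sketch below. The name value and the surrounding eval-file structure are assumptions (typeValue is presumably read from the entry's type field), while the field names and defaults come straight from the code: id falls back to rubric-&lt;n&gt;, weight to 1, required to true, and items with empty descriptions are dropped.

```ts
// Hypothetical evaluator entry as parseEvaluators would accept it.
const rubricEvaluatorEntry = {
  name: "answer-quality", // assumed name
  type: "rubric",
  rubrics: [
    { id: "clarity", description: "The answer is clearly written", weight: 2, required: true },
    { description: "Cites the relevant source" }, // id -> "rubric-2", weight 1, required true
  ],
};
```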
@@ -933,7 +956,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
  continue;
  }
  const conversationId = asString5(evalcase.conversation_id);
- const outcome = asString5(evalcase.outcome);
+ const outcome = asString5(evalcase.expected_outcome) ?? asString5(evalcase.outcome);
  const inputMessagesValue = evalcase.input_messages;
  const expectedMessagesValue = evalcase.expected_messages;
  if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
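Eval cases can now spell the field expected_outcome, with the older outcome kept as a fallback. A minimal sketch of the two accepted spellings (other fields illustrative):

```ts
// Both shapes resolve to the same outcome string after this change.
const newStyle = { id: "case-1", expected_outcome: "Explains the tradeoff", input_messages: [] };
const oldStyle = { id: "case-1", outcome: "Explains the tradeoff", input_messages: [] };
```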
@@ -987,6 +1010,33 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
  logError(`Skipping eval case '${id}': ${message}`);
  continue;
  }
+ const inlineRubrics = evalcase.rubrics;
+ if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
+ const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
+ if (typeof rubric === "string") {
+ return {
+ id: `rubric-${index + 1}`,
+ description: rubric,
+ weight: 1,
+ required: true
+ };
+ }
+ return {
+ id: asString5(rubric.id) ?? `rubric-${index + 1}`,
+ description: asString5(rubric.description) ?? "",
+ weight: typeof rubric.weight === "number" ? rubric.weight : 1,
+ required: typeof rubric.required === "boolean" ? rubric.required : true
+ };
+ }).filter((r) => r.description.length > 0);
+ if (rubricItems.length > 0) {
+ const rubricEvaluator = {
+ name: "rubric",
+ type: "rubric",
+ rubrics: rubricItems
+ };
+ evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
+ }
+ }
  const userFilePaths = [];
  for (const segment of inputSegments) {
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
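Per the loader above, an eval case may also carry inline rubrics, as bare strings or as objects; any valid items become a rubric evaluator named "rubric" that is prepended to the case's other evaluators. A sketch (case fields illustrative):

```ts
const evalCaseWithInlineRubrics = {
  id: "case-2",
  expected_outcome: "Summarizes the change accurately",
  input_messages: [],
  rubrics: [
    "Mentions the renamed chunk file", // string shorthand: id "rubric-1", weight 1, required
    { id: "tone", description: "Keeps a neutral tone", weight: 0.5, required: false },
  ],
};
```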
@@ -1085,6 +1135,9 @@ var AzureProvider = class {
  retryConfig: this.retryConfig
  });
  }
+ asLanguageModel() {
+ return this.model;
+ }
  };
  var AnthropicProvider = class {
  constructor(targetName, config) {
@@ -1118,6 +1171,9 @@ var AnthropicProvider = class {
  providerOptions
  });
  }
+ asLanguageModel() {
+ return this.model;
+ }
  };
  var GeminiProvider = class {
  constructor(targetName, config) {
@@ -1148,6 +1204,9 @@ var GeminiProvider = class {
  retryConfig: this.retryConfig
  });
  }
+ asLanguageModel() {
+ return this.model;
+ }
  };
  function buildAzureOptions(config) {
  const options = {
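All three providers gain asLanguageModel(), returning the underlying model object so callers can drive it directly. The rubric code later in this diff probes for it with optional chaining, so providers lacking the method fail with a clear error; a sketch of that consuming pattern:

```ts
// `provider` stands for any of the classes above.
const model = provider.asLanguageModel?.();
if (!model) {
  throw new Error("Judge provider does not support language model interface");
}
// `model` can now be passed to generateText({ model, system, prompt }).
```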
@@ -2498,38 +2557,38 @@ import {
  } from "subagent";

  // src/evaluation/providers/vscode-templates.ts
- var AGENTV_REQUEST_TEMPLATE = `[[ ## system_instructions ## ]]
+ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
+
+ {{userQuery}}

- **IMPORTANT**: You are processing an evaluation request from AgentV. Follow these exact steps:
+ [[ ## system_instructions ## ]]

- 1. Read and understand all guideline files and attachments provided above
- 2. Process the user query below according to the guidelines
- 3. Write your complete response to: {{responseFileTmp}}
- 4. When completely finished, run these PowerShell commands to signal completion:
- \`\`\`powershell
+ **IMPORTANT**: Follow these exact steps:
+ 1. Create and write your complete response to: {{responseFileTmp}}
+ - Do NOT create any additional output files in the workspace.
+ - All intended file outputs/changes MUST be written in your response file.
+ - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
+ 2. When completely finished, run these PowerShell commands to signal completion:
+ \`\`\`
  Move-Item -LiteralPath '{{responseFileTmp}}' -Destination '{{responseFileFinal}}'
- if (Test-Path subagent.lock) { Remove-Item subagent.lock }
+ if (Test-Path subagent.lock) { del subagent.lock }
  \`\`\`

- Do not proceed to step 4 until your response is completely written to the temporary file.
-
- [[ ## task ## ]]
-
- {{userQuery}}
+ Do not proceed to step 2 until your response is completely written to the temporary file.
  `;
- var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## system_instructions ## ]]
+ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]

- **IMPORTANT**: You are processing a batch evaluation request from AgentV. Follow these exact steps:
-
- 1. Read and understand all guideline files and attachments provided above
- 2. Process the user query below according to the guidelines
- 3. Write your complete response to: {{responseFileTmp}}
- 4. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
- 5. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
+ {{userQuery}}

- [[ ## task ## ]]
+ [[ ## system_instructions ## ]]

- {{userQuery}}
+ **IMPORTANT**: Follow these exact steps:
+ 1. Create and write your complete response to: {{responseFileTmp}}
+ - Do NOT create any additional output files in the workspace.
+ - All intended file outputs/changes MUST be written in your response file.
+ - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
+ 2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
+ 3. Do not unlock the workspace from this request; batch orchestration will handle unlocking after all responses are ready.
  `;

  // src/evaluation/providers/vscode.ts
@@ -2869,6 +2928,148 @@ function resolveAndCreateProvider(definition, env = process.env) {
  return createProvider(resolved);
  }

+ // src/evaluation/evaluators/rubric-evaluator.ts
+ import { generateText as generateText2 } from "ai";
+ import { z } from "zod";
+ var rubricCheckResultSchema = z.object({
+ id: z.string().describe("The ID of the rubric item being checked"),
+ satisfied: z.boolean().describe("Whether this rubric requirement is met"),
+ reasoning: z.string().describe("Brief explanation (1-2 sentences) for this check")
+ });
+ var rubricEvaluationSchema = z.object({
+ checks: z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
+ overall_reasoning: z.string().describe("Overall assessment summary (1-2 sentences)")
+ });
+ var RubricEvaluator = class {
+ kind = "rubric";
+ config;
+ resolveJudgeProvider;
+ constructor(options) {
+ this.config = options.config;
+ this.resolveJudgeProvider = options.resolveJudgeProvider;
+ }
+ async evaluate(context) {
+ const judgeProvider = await this.resolveJudgeProvider(context);
+ if (!judgeProvider) {
+ throw new Error("No judge provider available for rubric evaluation");
+ }
+ if (!this.config.rubrics || this.config.rubrics.length === 0) {
+ throw new Error(
+ `No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
+ );
+ }
+ const prompt = this.buildPrompt(context, this.config.rubrics);
+ const model = judgeProvider.asLanguageModel?.();
+ if (!model) {
+ throw new Error("Judge provider does not support language model interface");
+ }
+ const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
+ You must return a valid JSON object matching this schema:
+ {
+ "checks": [
+ {
+ "id": "string (rubric id)",
+ "satisfied": boolean,
+ "reasoning": "string (brief explanation)"
+ }
+ ],
+ "overall_reasoning": "string (summary)"
+ }`;
+ let result;
+ let lastError;
+ for (let attempt = 1; attempt <= 3; attempt++) {
+ try {
+ const { text } = await generateText2({
+ model,
+ system,
+ prompt
+ });
+ const cleaned = text.replace(/```json\n?|```/g, "").trim();
+ result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
+ break;
+ } catch (e) {
+ lastError = e instanceof Error ? e : new Error(String(e));
+ }
+ }
+ if (!result) {
+ throw new Error(
+ `Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
+ );
+ }
+ const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
+ return {
+ score,
+ verdict,
+ hits,
+ misses,
+ expectedAspectCount: this.config.rubrics.length,
+ reasoning: result.overall_reasoning,
+ evaluatorRawRequest: {
+ prompt
+ }
+ };
+ }
+ buildPrompt(context, rubrics) {
+ const parts = [
+ "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
+ "",
+ "[[ ## question ## ]]",
+ context.evalCase.question,
+ "",
+ "[[ ## expected_outcome ## ]]",
+ context.evalCase.expected_outcome,
+ ""
+ ];
+ if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
+ parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
+ }
+ parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
+ for (const rubric of rubrics) {
+ const requiredLabel = rubric.required ? " (REQUIRED)" : "";
+ const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
+ parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
+ }
+ parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
+ return parts.join("\n");
+ }
+ calculateScore(result, rubrics) {
+ const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
+ const hits = [];
+ const misses = [];
+ let totalWeight = 0;
+ let earnedWeight = 0;
+ let failedRequired = false;
+ for (const check of result.checks) {
+ const rubric = rubricMap.get(check.id);
+ if (!rubric) {
+ continue;
+ }
+ totalWeight += rubric.weight;
+ if (check.satisfied) {
+ earnedWeight += rubric.weight;
+ hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+ } else {
+ misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+ if (rubric.required) {
+ failedRequired = true;
+ }
+ }
+ }
+ const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
+ let verdict;
+ if (failedRequired) {
+ verdict = "fail";
+ } else if (score >= 0.8) {
+ verdict = "pass";
+ } else if (score >= 0.6) {
+ verdict = "borderline";
+ } else {
+ verdict = "fail";
+ }
+ return { score, verdict, hits, misses };
+ }
+ };
+
  // src/evaluation/evaluators.ts
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.

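The scoring rule in calculateScore is a weighted fraction clamped to [0, 1], and any failed required rubric forces a fail verdict regardless of score; otherwise the verdict is pass at 0.8 or above and borderline at 0.6 or above. A worked example (weights invented):

```ts
// Rubrics: clarity (weight 2, required), tone (weight 1, optional).
// Judge marks clarity satisfied, tone unsatisfied:
const score = Math.min(1, Math.max(0, 2 / (2 + 1))); // ≈ 0.67 → "borderline"
// Had clarity (required) failed instead, the verdict would be "fail"
// even though tone alone would still earn 1/3 of the weight.
```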
@@ -3833,6 +4034,7 @@ async function runEvaluatorList(options) {
  name: evaluator.name,
  type: evaluator.type,
  score: score2.score,
+ verdict: score2.verdict,
  hits: score2.hits,
  misses: score2.misses,
  reasoning: score2.reasoning,
@@ -3860,6 +4062,40 @@ async function runEvaluatorList(options) {
  name: evaluator.name,
  type: evaluator.type,
  score: score2.score,
+ verdict: score2.verdict,
+ hits: score2.hits,
+ misses: score2.misses,
+ reasoning: score2.reasoning,
+ evaluator_provider_request: score2.evaluatorRawRequest
+ });
+ continue;
+ }
+ if (evaluator.type === "rubric") {
+ const rubricEvaluator = new RubricEvaluator({
+ config: evaluator,
+ resolveJudgeProvider: async (context) => {
+ if (context.judgeProvider) {
+ return context.judgeProvider;
+ }
+ return judgeProvider;
+ }
+ });
+ const score2 = await rubricEvaluator.evaluate({
+ evalCase,
+ candidate,
+ target,
+ provider,
+ attempt,
+ promptInputs,
+ now,
+ judgeProvider
+ });
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+ evaluatorResults.push({
+ name: evaluator.name,
+ type: evaluator.type,
+ score: score2.score,
+ verdict: score2.verdict,
  hits: score2.hits,
  misses: score2.misses,
  reasoning: score2.reasoning,
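Each entry pushed to evaluatorResults now carries the judge's verdict alongside the numeric score. A sketch of one rubric-evaluator entry, with field names taken from the push above and values invented:

```ts
const exampleEntry = {
  name: "answer-quality",
  type: "rubric",
  score: 0.67,
  verdict: "borderline", // "pass" | "borderline" | "fail"
  hits: ["[clarity] The answer is clearly written: Clear and well structured."],
  misses: ["[tone] Keeps a neutral tone: Slightly promotional."],
  reasoning: "Satisfies most rubric items.",
  evaluator_provider_request: { prompt: "..." },
};
```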
@@ -4090,6 +4326,86 @@ function isTimeoutLike(error) {
  return value.includes("timeout");
  }

+ // src/evaluation/generators/rubric-generator.ts
+ import { generateText as generateText3 } from "ai";
+ import { z as z2 } from "zod";
+ var rubricItemSchema = z2.object({
+ id: z2.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
+ description: z2.string().describe("What this rubric checks for"),
+ weight: z2.number().default(1).describe("Relative importance (default 1.0)"),
+ required: z2.boolean().default(true).describe("Whether this is a mandatory requirement")
+ });
+ var rubricGenerationSchema = z2.object({
+ rubrics: z2.array(rubricItemSchema).describe("List of evaluation rubrics")
+ });
+ async function generateRubrics(options) {
+ const { expectedOutcome, question, referenceAnswer, provider } = options;
+ const prompt = buildPrompt(expectedOutcome, question, referenceAnswer);
+ const model = provider.asLanguageModel?.();
+ if (!model) {
+ throw new Error("Provider does not support language model interface");
+ }
+ const system = `You are an expert at creating evaluation rubrics.
+ You must return a valid JSON object matching this schema:
+ {
+ "rubrics": [
+ {
+ "id": "string (short identifier)",
+ "description": "string (what to check)",
+ "weight": number (default 1.0),
+ "required": boolean (default true)
+ }
+ ]
+ }`;
+ let result;
+ let lastError;
+ for (let attempt = 1; attempt <= 3; attempt++) {
+ try {
+ const { text } = await generateText3({
+ model,
+ system,
+ prompt
+ });
+ const cleaned = text.replace(/```json\n?|```/g, "").trim();
+ result = rubricGenerationSchema.parse(JSON.parse(cleaned));
+ break;
+ } catch (e) {
+ lastError = e instanceof Error ? e : new Error(String(e));
+ }
+ }
+ if (!result) {
+ throw new Error(`Failed to parse generated rubrics after 3 attempts: ${lastError?.message}`);
+ }
+ return result.rubrics;
+ }
+ function buildPrompt(expectedOutcome, question, referenceAnswer) {
+ const parts = [
+ "You are an expert at creating evaluation rubrics.",
+ "Given the expected outcome (and optionally the question and reference answer),",
+ "generate a list of specific, measurable rubric items to evaluate whether an answer meets the expected outcome.",
+ "",
+ "Each rubric should:",
+ "- Be specific and testable",
+ "- Have a short, descriptive ID",
+ "- Include a clear description of what to check",
+ "- Indicate if it is required (mandatory) or optional",
+ "- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
+ "",
+ "Generate 3-7 rubric items that comprehensively cover the expected outcome.",
+ "",
+ "[[ ## expected_outcome ## ]]",
+ expectedOutcome,
+ ""
+ ];
+ if (question && question.trim().length > 0) {
+ parts.push("[[ ## question ## ]]", question, "");
+ }
+ if (referenceAnswer && referenceAnswer.trim().length > 0) {
+ parts.push("[[ ## reference_answer ## ]]", referenceAnswer, "");
+ }
+ return parts.join("\n");
+ }
+
  // src/index.ts
  function createAgentKernel() {
  return { status: "stub" };
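A sketch of invoking the new generator; the option names come from the destructuring inside generateRubrics, the provider is assumed to be one of the classes that now implement asLanguageModel(), and judgeProvider is a placeholder:

```ts
import { generateRubrics } from "@agentv/core";

const rubrics = await generateRubrics({
  expectedOutcome: "Explains the breaking change and the migration path",
  question: "What changed between 0.20.0 and 0.22.0?", // optional
  referenceAnswer: undefined,                          // optional
  provider: judgeProvider,                             // must expose asLanguageModel()
});
// The prompt asks for 3-7 items shaped like { id, description, weight, required }.
```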
@@ -4097,6 +4413,7 @@ function createAgentKernel() {
  export {
  CodeEvaluator,
  LlmJudgeEvaluator,
+ RubricEvaluator,
  TEST_MESSAGE_ROLES,
  buildDirectoryChain,
  buildPromptInputs,
@@ -4108,6 +4425,7 @@ export {
  extractCodeBlocks,
  fileExists,
  findGitRoot,
+ generateRubrics,
  getHitCount,
  isEvaluatorKind,
  isGuidelineFile,
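For consumers, the two new public names import like any other export from the package (usage illustrative):

```ts
import { RubricEvaluator, generateRubrics } from "@agentv/core";
```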