@agentv/core 0.21.0 → 0.22.0
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- package/dist/{chunk-SVY324GN.js → chunk-BO7KG7JX.js} +1 -1
- package/dist/{chunk-SVY324GN.js.map → chunk-BO7KG7JX.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs +4 -4
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +5 -5
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +322 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +49 -3
- package/dist/index.d.ts +49 -3
- package/dist/index.js +321 -3
- package/dist/index.js.map +1 -1
- package/package.json +2 -5
package/dist/index.cjs (CHANGED)
@@ -32,6 +32,7 @@ var index_exports = {};
 __export(index_exports, {
   CodeEvaluator: () => CodeEvaluator,
   LlmJudgeEvaluator: () => LlmJudgeEvaluator,
+  RubricEvaluator: () => RubricEvaluator,
   TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
   buildDirectoryChain: () => buildDirectoryChain2,
   buildPromptInputs: () => buildPromptInputs,
@@ -43,6 +44,7 @@ __export(index_exports, {
   extractCodeBlocks: () => extractCodeBlocks,
   fileExists: () => fileExists2,
   findGitRoot: () => findGitRoot,
+  generateRubrics: () => generateRubrics,
   getHitCount: () => getHitCount,
   isEvaluatorKind: () => isEvaluatorKind,
   isGuidelineFile: () => isGuidelineFile,
@@ -106,7 +108,7 @@ function isTestMessage(value) {
   }
   return candidate.content.every(isJsonObject);
 }
-var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
+var EVALUATOR_KIND_VALUES = ["code", "llm_judge", "rubric"];
 var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
 function isEvaluatorKind(value) {
   return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
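
Note: this one-line change is the heart of the release — "rubric" becomes a third evaluator kind alongside "code" and "llm_judge". The published typings (index.d.ts / index.d.cts, +49 -3 in the summary above) presumably widen the corresponding union; the declaration below is an assumed sketch, with only the runtime values confirmed by this diff:

```ts
// Assumed shape of the widened union; the diff confirms only the runtime values.
type EvaluatorKind = "code" | "llm_judge" | "rubric";

// The runtime guard from the hunk above narrows unknown input to this union.
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
```
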
@@ -508,6 +510,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       }
     }
     const _model = asString2(rawEvaluator.model);
+    if (typeValue === "rubric") {
+      const rubrics = rawEvaluator.rubrics;
+      if (!Array.isArray(rubrics)) {
+        logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
+        continue;
+      }
+      const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
+        id: asString2(rubric.id) ?? `rubric-${index + 1}`,
+        description: asString2(rubric.description) ?? "",
+        weight: typeof rubric.weight === "number" ? rubric.weight : 1,
+        required: typeof rubric.required === "boolean" ? rubric.required : true
+      })).filter((r) => r.description.length > 0);
+      if (parsedRubrics.length === 0) {
+        logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
+        continue;
+      }
+      evaluators.push({
+        name,
+        type: "rubric",
+        rubrics: parsedRubrics
+      });
+      continue;
+    }
     evaluators.push({
       name,
       type: "llm_judge",
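
Note: per this parsing logic, a rubric evaluator entry needs a `rubrics` array whose items carry a non-empty `description`; `id` defaults to `rubric-<n>`, `weight` to 1, and `required` to true, and the evaluator is skipped with a warning if no valid items remain. A minimal sketch of an accepted entry, written as a TypeScript literal (the on-disk eval file format is not shown in this diff):

```ts
// Hypothetical entry; field names match what parseEvaluators reads above.
const rubricEvaluatorEntry = {
  name: "answer-quality", // assumed example name
  type: "rubric",
  rubrics: [
    { id: "cites-source", description: "Cites at least one source", weight: 2, required: true },
    { description: "Stays under 200 words" } // id -> "rubric-2", weight -> 1, required -> true
  ]
};
```
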
@@ -988,7 +1013,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       continue;
     }
     const conversationId = asString5(evalcase.conversation_id);
-    const outcome = asString5(evalcase.outcome);
+    const outcome = asString5(evalcase.expected_outcome) ?? asString5(evalcase.outcome);
     const inputMessagesValue = evalcase.input_messages;
     const expectedMessagesValue = evalcase.expected_messages;
     if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
@@ -1042,6 +1067,33 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       logError(`Skipping eval case '${id}': ${message}`);
       continue;
     }
+    const inlineRubrics = evalcase.rubrics;
+    if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
+      const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
+        if (typeof rubric === "string") {
+          return {
+            id: `rubric-${index + 1}`,
+            description: rubric,
+            weight: 1,
+            required: true
+          };
+        }
+        return {
+          id: asString5(rubric.id) ?? `rubric-${index + 1}`,
+          description: asString5(rubric.description) ?? "",
+          weight: typeof rubric.weight === "number" ? rubric.weight : 1,
+          required: typeof rubric.required === "boolean" ? rubric.required : true
+        };
+      }).filter((r) => r.description.length > 0);
+      if (rubricItems.length > 0) {
+        const rubricEvaluator = {
+          name: "rubric",
+          type: "rubric",
+          rubrics: rubricItems
+        };
+        evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
+      }
+    }
     const userFilePaths = [];
     for (const segment of inputSegments) {
       if (segment.type === "file" && typeof segment.resolvedPath === "string") {
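
Note: combined with the `expected_outcome` fallback in the previous hunk, an eval case can now declare rubrics inline, as bare strings or as objects; when any survive validation, a synthesized evaluator named "rubric" is prepended to the case's evaluator list. A hedged sketch of such a case (TypeScript literal; the serialized file format is not part of this diff):

```ts
// Hypothetical eval case; keys mirror what loadEvalCases reads above.
const evalCase = {
  id: "case-1", // assumed example id
  expected_outcome: "Explains the tradeoff clearly", // now preferred over legacy `outcome`
  input_messages: [{ role: "user", content: "Should we cache at the edge?" }],
  rubrics: [
    "Mentions both alternatives", // string form: becomes { id: "rubric-1", weight: 1, required: true }
    { id: "tone", description: "Keeps a neutral tone", required: false }
  ]
};
```
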
@@ -1251,6 +1303,9 @@ var AzureProvider = class {
       retryConfig: this.retryConfig
     });
   }
+  asLanguageModel() {
+    return this.model;
+  }
 };
 var AnthropicProvider = class {
   constructor(targetName, config) {
@@ -1284,6 +1339,9 @@ var AnthropicProvider = class {
       providerOptions
     });
   }
+  asLanguageModel() {
+    return this.model;
+  }
 };
 var GeminiProvider = class {
   constructor(targetName, config) {
@@ -1314,6 +1372,9 @@ var GeminiProvider = class {
       retryConfig: this.retryConfig
     });
   }
+  asLanguageModel() {
+    return this.model;
+  }
 };
 function buildAzureOptions(config) {
   const options = {
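
Note: all three providers gain the same `asLanguageModel()` accessor returning the underlying AI SDK model. Callers in this release treat the method as optional, so provider implementations without it keep working; a minimal consumer sketch mirroring the evaluator code below:

```ts
// Sketch of how the rubric evaluator and generator below obtain a model.
function requireModel(provider: { asLanguageModel?: () => unknown }): unknown {
  const model = provider.asLanguageModel?.(); // method may be absent on other providers
  if (!model) {
    throw new Error("Judge provider does not support language model interface");
  }
  return model;
}
```
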
@@ -3560,6 +3621,148 @@ function resolveAndCreateProvider(definition, env = process.env) {
   return createProvider(resolved);
 }
 
+// src/evaluation/evaluators/rubric-evaluator.ts
+var import_ai2 = require("ai");
+var import_zod2 = require("zod");
+var rubricCheckResultSchema = import_zod2.z.object({
+  id: import_zod2.z.string().describe("The ID of the rubric item being checked"),
+  satisfied: import_zod2.z.boolean().describe("Whether this rubric requirement is met"),
+  reasoning: import_zod2.z.string().describe("Brief explanation (1-2 sentences) for this check")
+});
+var rubricEvaluationSchema = import_zod2.z.object({
+  checks: import_zod2.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
+  overall_reasoning: import_zod2.z.string().describe("Overall assessment summary (1-2 sentences)")
+});
+var RubricEvaluator = class {
+  kind = "rubric";
+  config;
+  resolveJudgeProvider;
+  constructor(options) {
+    this.config = options.config;
+    this.resolveJudgeProvider = options.resolveJudgeProvider;
+  }
+  async evaluate(context) {
+    const judgeProvider = await this.resolveJudgeProvider(context);
+    if (!judgeProvider) {
+      throw new Error("No judge provider available for rubric evaluation");
+    }
+    if (!this.config.rubrics || this.config.rubrics.length === 0) {
+      throw new Error(
+        `No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
+      );
+    }
+    const prompt = this.buildPrompt(context, this.config.rubrics);
+    const model = judgeProvider.asLanguageModel?.();
+    if (!model) {
+      throw new Error("Judge provider does not support language model interface");
+    }
+    const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
+You must return a valid JSON object matching this schema:
+{
+  "checks": [
+    {
+      "id": "string (rubric id)",
+      "satisfied": boolean,
+      "reasoning": "string (brief explanation)"
+    }
+  ],
+  "overall_reasoning": "string (summary)"
+}`;
+    let result;
+    let lastError;
+    for (let attempt = 1; attempt <= 3; attempt++) {
+      try {
+        const { text } = await (0, import_ai2.generateText)({
+          model,
+          system,
+          prompt
+        });
+        const cleaned = text.replace(/```json\n?|```/g, "").trim();
+        result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
+        break;
+      } catch (e) {
+        lastError = e instanceof Error ? e : new Error(String(e));
+      }
+    }
+    if (!result) {
+      throw new Error(
+        `Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
+      );
+    }
+    const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
+    return {
+      score,
+      verdict,
+      hits,
+      misses,
+      expectedAspectCount: this.config.rubrics.length,
+      reasoning: result.overall_reasoning,
+      evaluatorRawRequest: {
+        prompt
+      }
+    };
+  }
+  buildPrompt(context, rubrics) {
+    const parts = [
+      "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
+      "",
+      "[[ ## question ## ]]",
+      context.evalCase.question,
+      "",
+      "[[ ## expected_outcome ## ]]",
+      context.evalCase.expected_outcome,
+      ""
+    ];
+    if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
+      parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
+    }
+    parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
+    for (const rubric of rubrics) {
+      const requiredLabel = rubric.required ? " (REQUIRED)" : "";
+      const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
+      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
+    }
+    parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
+    return parts.join("\n");
+  }
+  calculateScore(result, rubrics) {
+    const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
+    const hits = [];
+    const misses = [];
+    let totalWeight = 0;
+    let earnedWeight = 0;
+    let failedRequired = false;
+    for (const check of result.checks) {
+      const rubric = rubricMap.get(check.id);
+      if (!rubric) {
+        continue;
+      }
+      totalWeight += rubric.weight;
+      if (check.satisfied) {
+        earnedWeight += rubric.weight;
+        hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      } else {
+        misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+        if (rubric.required) {
+          failedRequired = true;
+        }
+      }
+    }
+    const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
+    let verdict;
+    if (failedRequired) {
+      verdict = "fail";
+    } else if (score >= 0.8) {
+      verdict = "pass";
+    } else if (score >= 0.6) {
+      verdict = "borderline";
+    } else {
+      verdict = "fail";
+    }
+    return { score, verdict, hits, misses };
+  }
+};
+
 // src/evaluation/evaluators.ts
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
 
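
Note: `calculateScore` is a weighted average with a hard gate — a missed required rubric forces verdict "fail" regardless of score; otherwise score >= 0.8 passes and >= 0.6 is borderline, and checks whose `id` matches no configured rubric are ignored. A worked example under assumed inputs:

```ts
// Hypothetical rubrics and judge output.
// Rubrics: a (weight 2, required), b (weight 1, optional), c (weight 1, optional).
// Judge marks a and b satisfied, c unsatisfied.
//   totalWeight  = 2 + 1 + 1 = 4
//   earnedWeight = 2 + 1     = 3
//   score        = 3 / 4     = 0.75
// c is optional, so failedRequired stays false -> verdict "borderline" (0.6 <= 0.75 < 0.8).
// Had the miss been `a` (required), the verdict would be "fail" despite score 0.5.
```
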
@@ -4534,6 +4737,7 @@ async function runEvaluatorList(options) {
         name: evaluator.name,
         type: evaluator.type,
         score: score2.score,
+        verdict: score2.verdict,
         hits: score2.hits,
         misses: score2.misses,
         reasoning: score2.reasoning,
@@ -4561,6 +4765,40 @@ async function runEvaluatorList(options) {
         name: evaluator.name,
         type: evaluator.type,
         score: score2.score,
+        verdict: score2.verdict,
+        hits: score2.hits,
+        misses: score2.misses,
+        reasoning: score2.reasoning,
+        evaluator_provider_request: score2.evaluatorRawRequest
+      });
+      continue;
+    }
+    if (evaluator.type === "rubric") {
+      const rubricEvaluator = new RubricEvaluator({
+        config: evaluator,
+        resolveJudgeProvider: async (context) => {
+          if (context.judgeProvider) {
+            return context.judgeProvider;
+          }
+          return judgeProvider;
+        }
+      });
+      const score2 = await rubricEvaluator.evaluate({
+        evalCase,
+        candidate,
+        target,
+        provider,
+        attempt,
+        promptInputs,
+        now,
+        judgeProvider
+      });
+      scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+      evaluatorResults.push({
+        name: evaluator.name,
+        type: evaluator.type,
+        score: score2.score,
+        verdict: score2.verdict,
         hits: score2.hits,
         misses: score2.misses,
         reasoning: score2.reasoning,
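
Note the judge resolution order in this branch: a judge attached to the evaluation context wins, and the run-level judge passed to runEvaluatorList is the fallback; RubricEvaluator.evaluate() throws if neither exists. Effectively:

```ts
// Equivalent to the closure above (the context type is an assumption).
declare const runLevelJudgeProvider: unknown; // assumed: the `judgeProvider` in scope above
const resolveJudgeProvider = async (context: { judgeProvider?: unknown }) =>
  context.judgeProvider ?? runLevelJudgeProvider; // fall back to the run-level judge
```
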
@@ -4791,6 +5029,86 @@ function isTimeoutLike(error) {
   return value.includes("timeout");
 }
 
+// src/evaluation/generators/rubric-generator.ts
+var import_ai3 = require("ai");
+var import_zod3 = require("zod");
+var rubricItemSchema = import_zod3.z.object({
+  id: import_zod3.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
+  description: import_zod3.z.string().describe("What this rubric checks for"),
+  weight: import_zod3.z.number().default(1).describe("Relative importance (default 1.0)"),
+  required: import_zod3.z.boolean().default(true).describe("Whether this is a mandatory requirement")
+});
+var rubricGenerationSchema = import_zod3.z.object({
+  rubrics: import_zod3.z.array(rubricItemSchema).describe("List of evaluation rubrics")
+});
+async function generateRubrics(options) {
+  const { expectedOutcome, question, referenceAnswer, provider } = options;
+  const prompt = buildPrompt(expectedOutcome, question, referenceAnswer);
+  const model = provider.asLanguageModel?.();
+  if (!model) {
+    throw new Error("Provider does not support language model interface");
+  }
+  const system = `You are an expert at creating evaluation rubrics.
+You must return a valid JSON object matching this schema:
+{
+  "rubrics": [
+    {
+      "id": "string (short identifier)",
+      "description": "string (what to check)",
+      "weight": number (default 1.0),
+      "required": boolean (default true)
+    }
+  ]
+}`;
+  let result;
+  let lastError;
+  for (let attempt = 1; attempt <= 3; attempt++) {
+    try {
+      const { text } = await (0, import_ai3.generateText)({
+        model,
+        system,
+        prompt
+      });
+      const cleaned = text.replace(/```json\n?|```/g, "").trim();
+      result = rubricGenerationSchema.parse(JSON.parse(cleaned));
+      break;
+    } catch (e) {
+      lastError = e instanceof Error ? e : new Error(String(e));
+    }
+  }
+  if (!result) {
+    throw new Error(`Failed to parse generated rubrics after 3 attempts: ${lastError?.message}`);
+  }
+  return result.rubrics;
+}
+function buildPrompt(expectedOutcome, question, referenceAnswer) {
+  const parts = [
+    "You are an expert at creating evaluation rubrics.",
+    "Given the expected outcome (and optionally the question and reference answer),",
+    "generate a list of specific, measurable rubric items to evaluate whether an answer meets the expected outcome.",
+    "",
+    "Each rubric should:",
+    "- Be specific and testable",
+    "- Have a short, descriptive ID",
+    "- Include a clear description of what to check",
+    "- Indicate if it is required (mandatory) or optional",
+    "- Have an appropriate weight (default 1.0, use higher values for more important aspects)",
+    "",
+    "Generate 3-7 rubric items that comprehensively cover the expected outcome.",
+    "",
+    "[[ ## expected_outcome ## ]]",
+    expectedOutcome,
+    ""
+  ];
+  if (question && question.trim().length > 0) {
+    parts.push("[[ ## question ## ]]", question, "");
+  }
+  if (referenceAnswer && referenceAnswer.trim().length > 0) {
+    parts.push("[[ ## reference_answer ## ]]", referenceAnswer, "");
+  }
+  return parts.join("\n");
+}
+
 // src/index.ts
 function createAgentKernel() {
   return { status: "stub" };
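
Note: pairing the generator with the evaluator, the intended flow appears to be: derive rubrics once from a case's expected outcome (the RubricEvaluator error message points at an "agentv generate rubrics" command), store them on a rubric evaluator, then grade candidates with RubricEvaluator. A hedged usage sketch for the newly exported generateRubrics:

```ts
import { generateRubrics } from "@agentv/core";

// Assumed: any provider instance exposing asLanguageModel(), e.g. the ones above.
declare const judgeProvider: { asLanguageModel?: () => unknown };

const rubrics = await generateRubrics({
  expectedOutcome: "Explains the caching tradeoff clearly", // required by the destructuring above
  question: "Should we cache at the edge?",                 // optional context
  referenceAnswer: undefined,                               // optional context
  provider: judgeProvider,
});
// -> [{ id, description, weight, required }, ...] (3-7 items per the generation prompt)
```
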
@@ -4799,6 +5117,7 @@ function createAgentKernel() {
 0 && (module.exports = {
   CodeEvaluator,
   LlmJudgeEvaluator,
+  RubricEvaluator,
   TEST_MESSAGE_ROLES,
   buildDirectoryChain,
   buildPromptInputs,
@@ -4810,6 +5129,7 @@ function createAgentKernel() {
   extractCodeBlocks,
   fileExists,
   findGitRoot,
+  generateRubrics,
   getHitCount,
   isEvaluatorKind,
   isGuidelineFile,