@agentv/core 0.22.1 → 0.22.2
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/dist/index.cjs +217 -267
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +8 -23
- package/dist/index.d.ts +8 -23
- package/dist/index.js +217 -266
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
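The headline change visible in the index.cjs hunks below is that the standalone RubricEvaluator class and export are removed: rubric definitions are now parsed into plain "llm_judge" evaluators that carry a rubrics array, LlmJudgeEvaluator routes between freeform and rubric grading, and every evaluator result gains a verdict field. As a rough, hypothetical sketch only (field names and defaults are taken from the diff below; the object shown is not a documented @agentv/core type), the evaluator config emitted for a rubric block in 0.22.2 looks roughly like this:

    // Hypothetical illustration inferred from the diff below; not a documented public type.
    const rubricEvaluator = {
      name: "rubric",
      type: "llm_judge",   // 0.22.1 used type: "rubric" here
      rubrics: [
        {
          id: "rubric-1",     // defaults to `rubric-${index + 1}` when no id is given
          description: "...", // entries with an empty description are filtered out
          weight: 1,          // defaults to 1
          required: true      // defaults to true; a missed required rubric forces a "fail" verdict
        }
      ]
    };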
package/dist/index.cjs
CHANGED
@@ -32,7 +32,6 @@ var index_exports = {};
 __export(index_exports, {
   CodeEvaluator: () => CodeEvaluator,
   LlmJudgeEvaluator: () => LlmJudgeEvaluator,
-  RubricEvaluator: () => RubricEvaluator,
   TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
   buildDirectoryChain: () => buildDirectoryChain2,
   buildPromptInputs: () => buildPromptInputs,
@@ -510,25 +509,25 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       }
     }
     const _model = asString2(rawEvaluator.model);
+    const rawRubrics = rawEvaluator.rubrics;
+    const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
+      id: asString2(rubric.id) ?? `rubric-${index + 1}`,
+      description: asString2(rubric.description) ?? "",
+      weight: typeof rubric.weight === "number" ? rubric.weight : 1,
+      required: typeof rubric.required === "boolean" ? rubric.required : true
+    })).filter((r) => r.description.length > 0) : void 0;
     if (typeValue === "rubric") {
-
-      if (!Array.isArray(rubrics)) {
+      if (!parsedRubrics) {
         logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
         continue;
       }
-      const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
-        id: asString2(rubric.id) ?? `rubric-${index + 1}`,
-        description: asString2(rubric.description) ?? "",
-        weight: typeof rubric.weight === "number" ? rubric.weight : 1,
-        required: typeof rubric.required === "boolean" ? rubric.required : true
-      })).filter((r) => r.description.length > 0);
       if (parsedRubrics.length === 0) {
         logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
         continue;
       }
       evaluators.push({
         name,
-        type: "rubric",
+        type: "llm_judge",
         rubrics: parsedRubrics
       });
       continue;
@@ -537,7 +536,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       name,
       type: "llm_judge",
       prompt,
-      promptPath
+      promptPath,
+      ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
     });
   }
   return evaluators.length > 0 ? evaluators : void 0;
@@ -1088,7 +1088,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
     if (rubricItems.length > 0) {
       const rubricEvaluator = {
         name: "rubric",
-        type: "rubric",
+        type: "llm_judge",
         rubrics: rubricItems
       };
       evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
@@ -3621,149 +3621,9 @@ function resolveAndCreateProvider(definition, env = process.env) {
   return createProvider(resolved);
 }
 
-// src/evaluation/evaluators
+// src/evaluation/evaluators.ts
 var import_ai2 = require("ai");
 var import_zod2 = require("zod");
-var rubricCheckResultSchema = import_zod2.z.object({
-  id: import_zod2.z.string().describe("The ID of the rubric item being checked"),
-  satisfied: import_zod2.z.boolean().describe("Whether this rubric requirement is met"),
-  reasoning: import_zod2.z.string().describe("Brief explanation (1-2 sentences) for this check")
-});
-var rubricEvaluationSchema = import_zod2.z.object({
-  checks: import_zod2.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
-  overall_reasoning: import_zod2.z.string().describe("Overall assessment summary (1-2 sentences)")
-});
-var RubricEvaluator = class {
-  kind = "rubric";
-  config;
-  resolveJudgeProvider;
-  constructor(options) {
-    this.config = options.config;
-    this.resolveJudgeProvider = options.resolveJudgeProvider;
-  }
-  async evaluate(context) {
-    const judgeProvider = await this.resolveJudgeProvider(context);
-    if (!judgeProvider) {
-      throw new Error("No judge provider available for rubric evaluation");
-    }
-    if (!this.config.rubrics || this.config.rubrics.length === 0) {
-      throw new Error(
-        `No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
-      );
-    }
-    const prompt = this.buildPrompt(context, this.config.rubrics);
-    const model = judgeProvider.asLanguageModel?.();
-    if (!model) {
-      throw new Error("Judge provider does not support language model interface");
-    }
-    const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
-You must return a valid JSON object matching this schema:
-{
-"checks": [
-{
-"id": "string (rubric id)",
-"satisfied": boolean,
-"reasoning": "string (brief explanation)"
-}
-],
-"overall_reasoning": "string (summary)"
-}`;
-    let result;
-    let lastError;
-    for (let attempt = 1; attempt <= 3; attempt++) {
-      try {
-        const { text } = await (0, import_ai2.generateText)({
-          model,
-          system,
-          prompt
-        });
-        const cleaned = text.replace(/```json\n?|```/g, "").trim();
-        result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
-        break;
-      } catch (e) {
-        lastError = e instanceof Error ? e : new Error(String(e));
-      }
-    }
-    if (!result) {
-      throw new Error(
-        `Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
-      );
-    }
-    const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
-    return {
-      score,
-      verdict,
-      hits,
-      misses,
-      expectedAspectCount: this.config.rubrics.length,
-      reasoning: result.overall_reasoning,
-      evaluatorRawRequest: {
-        prompt
-      }
-    };
-  }
-  buildPrompt(context, rubrics) {
-    const parts = [
-      "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
-      "",
-      "[[ ## question ## ]]",
-      context.evalCase.question,
-      "",
-      "[[ ## expected_outcome ## ]]",
-      context.evalCase.expected_outcome,
-      ""
-    ];
-    if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
-      parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
-    }
-    parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
-    for (const rubric of rubrics) {
-      const requiredLabel = rubric.required ? " (REQUIRED)" : "";
-      const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
-      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
-    }
-    parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
-    return parts.join("\n");
-  }
-  calculateScore(result, rubrics) {
-    const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
-    const hits = [];
-    const misses = [];
-    let totalWeight = 0;
-    let earnedWeight = 0;
-    let failedRequired = false;
-    for (const check of result.checks) {
-      const rubric = rubricMap.get(check.id);
-      if (!rubric) {
-        continue;
-      }
-      totalWeight += rubric.weight;
-      if (check.satisfied) {
-        earnedWeight += rubric.weight;
-        hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
-      } else {
-        misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
-        if (rubric.required) {
-          failedRequired = true;
-        }
-      }
-    }
-    const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
-    let verdict;
-    if (failedRequired) {
-      verdict = "fail";
-    } else if (score >= 0.8) {
-      verdict = "pass";
-    } else if (score >= 0.6) {
-      verdict = "borderline";
-    } else {
-      verdict = "fail";
-    }
-    return { score, verdict, hits, misses };
-  }
-};
-
-// src/evaluation/evaluators.ts
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
 
 Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -3781,6 +3641,21 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
 
 [[ ## candidate_answer ## ]]
 {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
+var freeformEvaluationSchema = import_zod2.z.object({
+  score: import_zod2.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
+  hits: import_zod2.z.array(import_zod2.z.string()).describe("Brief specific achievements").optional(),
+  misses: import_zod2.z.array(import_zod2.z.string()).describe("Brief failures or omissions").optional(),
+  reasoning: import_zod2.z.string().describe("Concise explanation (1-2 sentences)").optional()
+});
+var rubricCheckResultSchema = import_zod2.z.object({
+  id: import_zod2.z.string().describe("The ID of the rubric item being checked"),
+  satisfied: import_zod2.z.boolean().describe("Whether this rubric requirement is met"),
+  reasoning: import_zod2.z.string().describe("Brief explanation (1-2 sentences) for this check")
+});
+var rubricEvaluationSchema = import_zod2.z.object({
+  checks: import_zod2.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
+  overall_reasoning: import_zod2.z.string().describe("Overall assessment summary (1-2 sentences)")
+});
 var LlmJudgeEvaluator = class {
   kind = "llm_judge";
   resolveJudgeProvider;
@@ -3798,9 +3673,13 @@ var LlmJudgeEvaluator = class {
     if (!judgeProvider) {
       throw new Error("No judge provider available for LLM grading");
    }
-
+    const config = context.evaluator;
+    if (config?.type === "llm_judge" && config.rubrics && config.rubrics.length > 0) {
+      return this.evaluateWithRubrics(context, judgeProvider, config.rubrics);
+    }
+    return this.evaluateFreeform(context, judgeProvider);
   }
-  async
+  async evaluateFreeform(context, judgeProvider) {
     const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
     const variables = {
       [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
@@ -3817,34 +3696,132 @@ var LlmJudgeEvaluator = class {
     const systemPrompt = buildOutputSchema();
     const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
     const userPrompt = substituteVariables(evaluatorTemplate, variables);
-    const response = await judgeProvider.invoke({
-      question: userPrompt,
-      systemPrompt,
-      evalCaseId: context.evalCase.id,
-      attempt: context.attempt,
-      maxOutputTokens: this.maxOutputTokens,
-      temperature: this.temperature
-    });
-    const parsed = parseQualityResponse(response);
-    const score = clampScore(parsed.score ?? 0);
-    const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
-    const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
-    const reasoning = parsed.reasoning ?? response.reasoning;
-    const expectedAspectCount = Math.max(hits.length + misses.length, 1);
     const evaluatorRawRequest = {
       userPrompt,
       systemPrompt,
       target: judgeProvider.targetName
     };
+    try {
+      const { data, providerResponse } = await this.runWithRetry({
+        context,
+        judgeProvider,
+        systemPrompt,
+        userPrompt,
+        schema: freeformEvaluationSchema
+      });
+      const score = clampScore(data.score);
+      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
+      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
+      const reasoning = data.reasoning ?? providerResponse?.reasoning;
+      const expectedAspectCount = Math.max(hits.length + misses.length, 1);
+      return {
+        score,
+        verdict: scoreToVerdict(score),
+        hits,
+        misses,
+        expectedAspectCount,
+        reasoning,
+        evaluatorRawRequest
+      };
+    } catch {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: [],
+        expectedAspectCount: 1,
+        evaluatorRawRequest
+      };
+    }
+  }
+  async evaluateWithRubrics(context, judgeProvider, rubrics) {
+    if (!rubrics || rubrics.length === 0) {
+      throw new Error(
+        `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
+      );
+    }
+    const prompt = this.buildRubricPrompt(context, rubrics);
+    const systemPrompt = buildRubricOutputSchema();
+    const evaluatorRawRequest = {
+      userPrompt: prompt,
+      systemPrompt,
+      target: judgeProvider.targetName
+    };
+    const { data } = await this.runWithRetry({
+      context,
+      judgeProvider,
+      systemPrompt,
+      userPrompt: prompt,
+      schema: rubricEvaluationSchema
+    });
+    const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
     return {
       score,
+      verdict,
       hits,
       misses,
-      expectedAspectCount,
-      reasoning,
+      expectedAspectCount: rubrics.length,
+      reasoning: data.overall_reasoning,
       evaluatorRawRequest
     };
   }
+  buildRubricPrompt(context, rubrics) {
+    const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
+    const parts = [
+      "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
+      "",
+      "[[ ## question ## ]]",
+      formattedQuestion,
+      "",
+      "[[ ## expected_outcome ## ]]",
+      context.evalCase.expected_outcome,
+      ""
+    ];
+    if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
+      parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
+    }
+    parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
+    for (const rubric of rubrics) {
+      const requiredLabel = rubric.required ? " (REQUIRED)" : "";
+      const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
+      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
+    }
+    parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
+    return parts.join("\n");
+  }
+  async runWithRetry(options) {
+    const { context, judgeProvider, systemPrompt, userPrompt, schema } = options;
+    let lastError;
+    for (let attempt = 1; attempt <= 3; attempt++) {
+      try {
+        const model = judgeProvider.asLanguageModel?.();
+        if (model) {
+          const { text } = await (0, import_ai2.generateText)({
+            model,
+            system: systemPrompt,
+            prompt: userPrompt,
+            ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
+            ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
+          });
+          const data2 = schema.parse(parseJsonFromText(text));
+          return { data: data2 };
+        }
        const response = await judgeProvider.invoke({
+          question: userPrompt,
+          systemPrompt,
+          evalCaseId: context.evalCase.id,
+          attempt: context.attempt,
+          maxOutputTokens: this.maxOutputTokens,
+          temperature: this.temperature
+        });
+        const data = schema.parse(parseJsonFromText(response.text ?? ""));
+        return { data, providerResponse: response };
+      } catch (e) {
+        lastError = e instanceof Error ? e : new Error(String(e));
+      }
+    }
+    throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
+  }
 };
 function buildOutputSchema() {
   return [
@@ -3858,6 +3835,29 @@ function buildOutputSchema() {
     "}"
   ].join("\n");
 }
+function buildRubricOutputSchema() {
+  return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
+You must return a valid JSON object matching this schema:
+{
+"checks": [
+{
+"id": "string (rubric id)",
+"satisfied": boolean,
+"reasoning": "string (brief explanation)"
+}
+],
+"overall_reasoning": "string (summary)"
+}`;
+}
+function scoreToVerdict(score) {
+  if (score >= 0.8) {
+    return "pass";
+  }
+  if (score >= 0.6) {
+    return "borderline";
+  }
+  return "fail";
+}
 function clampScore(value) {
   if (Number.isNaN(value) || !Number.isFinite(value)) {
     return 0;
@@ -3870,71 +3870,15 @@ function clampScore(value) {
   }
   return value;
 }
-function parseQualityResponse(response) {
-  const text = typeof response.text === "string" ? response.text.trim() : "";
-  if (text.length === 0) {
-    return {};
-  }
-  const direct = attemptParseJson(text);
-  if (direct && validateQualityJson(direct)) {
-    return direct;
-  }
-  const extracted = extractJsonBlob(text);
-  if (extracted) {
-    const parsed = attemptParseJson(extracted);
-    if (parsed && validateQualityJson(parsed)) {
-      return parsed;
-    }
-  }
-  return {};
-}
-function attemptParseJson(text) {
-  try {
-    const parsed = JSON.parse(text);
-    const score = typeof parsed.score === "number" ? parsed.score : void 0;
-    const hits = parsed.hits;
-    const misses = parsed.misses;
-    const reasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : void 0;
-    return { score, hits, misses, reasoning };
-  } catch {
-    return void 0;
-  }
-}
-function validateQualityJson(parsed) {
-  if (typeof parsed.score !== "number") {
-    return false;
-  }
-  if (Number.isNaN(parsed.score) || !Number.isFinite(parsed.score)) {
-    return false;
-  }
-  if (parsed.score < 0 || parsed.score > 1) {
-    return false;
-  }
-  if (parsed.hits !== void 0) {
-    if (!Array.isArray(parsed.hits)) {
-      return false;
-    }
-    if (!parsed.hits.every((item) => typeof item === "string")) {
-      return false;
-    }
-  }
-  if (parsed.misses !== void 0) {
-    if (!Array.isArray(parsed.misses)) {
-      return false;
-    }
-    if (!parsed.misses.every((item) => typeof item === "string")) {
-      return false;
-    }
-  }
-  if (parsed.reasoning !== void 0 && typeof parsed.reasoning !== "string") {
-    return false;
-  }
-  return true;
-}
 function extractJsonBlob(text) {
   const match = text.match(/\{[\s\S]*\}/);
   return match?.[0];
 }
+function parseJsonFromText(text) {
+  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
+  const blob = extractJsonBlob(cleaned) ?? cleaned;
+  return JSON.parse(blob);
+}
 function isNonEmptyString(value) {
   return typeof value === "string" && value.trim().length > 0;
 }
@@ -3971,6 +3915,7 @@ var CodeEvaluator = class {
       const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
       return {
         score,
+        verdict: scoreToVerdict(score),
         hits,
         misses,
         expectedAspectCount: hits.length + misses.length || 1,
@@ -3984,6 +3929,7 @@ var CodeEvaluator = class {
       const message = error instanceof Error ? error.message : String(error);
      return {
         score: 0,
+        verdict: "fail",
         hits: [],
         misses: [`Code evaluator failed: ${message}`],
         expectedAspectCount: 1,
@@ -3997,6 +3943,33 @@ var CodeEvaluator = class {
     }
   }
 };
+function calculateRubricScore(result, rubrics) {
+  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
+  const hits = [];
+  const misses = [];
+  let totalWeight = 0;
+  let earnedWeight = 0;
+  let failedRequired = false;
+  for (const check of result.checks) {
+    const rubric = rubricMap.get(check.id);
+    if (!rubric) {
+      continue;
+    }
+    totalWeight += rubric.weight;
+    if (check.satisfied) {
+      earnedWeight += rubric.weight;
+      hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+    } else {
+      misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      if (rubric.required) {
+        failedRequired = true;
+      }
+    }
+  }
+  const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
+  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
+  return { score, verdict, hits, misses };
+}
 async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
   const { spawn: spawn2 } = await import("child_process");
   return await new Promise((resolve, reject) => {
@@ -4743,7 +4716,6 @@ async function runEvaluatorList(options) {
           reasoning: score2.reasoning,
           evaluator_provider_request: score2.evaluatorRawRequest
         });
-        continue;
       }
       if (evaluator.type === "code") {
         const codeEvaluator = new CodeEvaluator({
@@ -4771,44 +4743,12 @@ async function runEvaluatorList(options) {
           reasoning: score2.reasoning,
          evaluator_provider_request: score2.evaluatorRawRequest
         });
-        continue;
-      }
-      if (evaluator.type === "rubric") {
-        const rubricEvaluator = new RubricEvaluator({
-          config: evaluator,
-          resolveJudgeProvider: async (context) => {
-            if (context.judgeProvider) {
-              return context.judgeProvider;
-            }
-            return judgeProvider;
-          }
-        });
-        const score2 = await rubricEvaluator.evaluate({
-          evalCase,
-          candidate,
-          target,
-          provider,
-          attempt,
-          promptInputs,
-          now,
-          judgeProvider
-        });
-        scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
-        evaluatorResults.push({
-          name: evaluator.name,
-          type: evaluator.type,
-          score: score2.score,
-          verdict: score2.verdict,
-          hits: score2.hits,
-          misses: score2.misses,
-          reasoning: score2.reasoning,
-          evaluator_provider_request: score2.evaluatorRawRequest
-        });
       }
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
      const fallbackScore = {
        score: 0,
+        verdict: "fail",
        hits: [],
        misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
        expectedAspectCount: 1,
@@ -4823,6 +4763,7 @@ async function runEvaluatorList(options) {
         name: evaluator.name ?? "unknown",
         type: evaluator.type ?? "unknown",
         score: 0,
+        verdict: "fail",
         hits: [],
         misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
         reasoning: message
@@ -4841,6 +4782,7 @@ async function runEvaluatorList(options) {
   const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
   const score = {
     score: aggregateScore,
+    verdict: scoreToVerdict2(aggregateScore),
     hits,
     misses,
     expectedAspectCount,
@@ -4891,6 +4833,15 @@ async function resolveCustomPrompt(config) {
 function isNonEmptyString2(value) {
   return typeof value === "string" && value.trim().length > 0;
 }
+function scoreToVerdict2(score) {
+  if (score >= 0.8) {
+    return "pass";
+  }
+  if (score >= 0.6) {
+    return "borderline";
+  }
+  return "fail";
+}
 function filterEvalCases(evalCases, evalId) {
   if (!evalId) {
     return evalCases;
@@ -5117,7 +5068,6 @@ function createAgentKernel() {
 0 && (module.exports = {
   CodeEvaluator,
   LlmJudgeEvaluator,
-  RubricEvaluator,
   TEST_MESSAGE_ROLES,
   buildDirectoryChain,
   buildPromptInputs,