@agentv/core 0.22.0 → 0.22.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-BO7KG7JX.js → chunk-B2J23S7D.js} +1 -1
- package/dist/{chunk-BO7KG7JX.js.map → chunk-B2J23S7D.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs +38 -24
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.d.cts +4 -1
- package/dist/evaluation/validation/index.d.ts +4 -1
- package/dist/evaluation/validation/index.js +27 -13
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +217 -267
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +8 -23
- package/dist/index.d.ts +8 -23
- package/dist/index.js +218 -267
- package/dist/index.js.map +1 -1
- package/package.json +5 -2
package/dist/index.js
CHANGED
|
@@ -8,7 +8,7 @@ import {
|
|
|
8
8
|
readTextFile,
|
|
9
9
|
resolveFileReference,
|
|
10
10
|
resolveTargetDefinition
|
|
11
|
-
} from "./chunk-BO7KG7JX.js";
|
|
11
|
+
} from "./chunk-B2J23S7D.js";
|
|
12
12
|
|
|
13
13
|
// src/evaluation/types.ts
|
|
14
14
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -453,25 +453,25 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
453
453
|
}
|
|
454
454
|
}
|
|
455
455
|
const _model = asString2(rawEvaluator.model);
|
|
456
|
+
const rawRubrics = rawEvaluator.rubrics;
|
|
457
|
+
const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
|
|
458
|
+
id: asString2(rubric.id) ?? `rubric-${index + 1}`,
|
|
459
|
+
description: asString2(rubric.description) ?? "",
|
|
460
|
+
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
461
|
+
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
462
|
+
})).filter((r) => r.description.length > 0) : void 0;
|
|
456
463
|
if (typeValue === "rubric") {
|
|
457
|
-
|
|
458
|
-
if (!Array.isArray(rubrics)) {
|
|
464
|
+
if (!parsedRubrics) {
|
|
459
465
|
logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
|
|
460
466
|
continue;
|
|
461
467
|
}
|
|
462
|
-
const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
|
|
463
|
-
id: asString2(rubric.id) ?? `rubric-${index + 1}`,
|
|
464
|
-
description: asString2(rubric.description) ?? "",
|
|
465
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
466
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
467
|
-
})).filter((r) => r.description.length > 0);
|
|
468
468
|
if (parsedRubrics.length === 0) {
|
|
469
469
|
logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
|
|
470
470
|
continue;
|
|
471
471
|
}
|
|
472
472
|
evaluators.push({
|
|
473
473
|
name,
|
|
474
|
-
type: "rubric",
|
|
474
|
+
type: "llm_judge",
|
|
475
475
|
rubrics: parsedRubrics
|
|
476
476
|
});
|
|
477
477
|
continue;
|
|
@@ -480,7 +480,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
480
480
|
name,
|
|
481
481
|
type: "llm_judge",
|
|
482
482
|
prompt,
|
|
483
|
-
promptPath
|
|
483
|
+
promptPath,
|
|
484
|
+
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
|
|
484
485
|
});
|
|
485
486
|
}
|
|
486
487
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -1031,7 +1032,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1031
1032
|
if (rubricItems.length > 0) {
|
|
1032
1033
|
const rubricEvaluator = {
|
|
1033
1034
|
name: "rubric",
|
|
1034
|
-
type: "rubric",
|
|
1035
|
+
type: "llm_judge",
|
|
1035
1036
|
rubrics: rubricItems
|
|
1036
1037
|
};
|
|
1037
1038
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
@@ -2928,149 +2929,9 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
2928
2929
|
return createProvider(resolved);
|
|
2929
2930
|
}
|
|
2930
2931
|
|
|
2931
|
-
// src/evaluation/evaluators
|
|
2932
|
+
// src/evaluation/evaluators.ts
|
|
2932
2933
|
import { generateText as generateText2 } from "ai";
|
|
2933
2934
|
import { z } from "zod";
|
|
2934
|
-
var rubricCheckResultSchema = z.object({
|
|
2935
|
-
id: z.string().describe("The ID of the rubric item being checked"),
|
|
2936
|
-
satisfied: z.boolean().describe("Whether this rubric requirement is met"),
|
|
2937
|
-
reasoning: z.string().describe("Brief explanation (1-2 sentences) for this check")
|
|
2938
|
-
});
|
|
2939
|
-
var rubricEvaluationSchema = z.object({
|
|
2940
|
-
checks: z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
|
|
2941
|
-
overall_reasoning: z.string().describe("Overall assessment summary (1-2 sentences)")
|
|
2942
|
-
});
|
|
2943
|
-
var RubricEvaluator = class {
|
|
2944
|
-
kind = "rubric";
|
|
2945
|
-
config;
|
|
2946
|
-
resolveJudgeProvider;
|
|
2947
|
-
constructor(options) {
|
|
2948
|
-
this.config = options.config;
|
|
2949
|
-
this.resolveJudgeProvider = options.resolveJudgeProvider;
|
|
2950
|
-
}
|
|
2951
|
-
async evaluate(context) {
|
|
2952
|
-
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
2953
|
-
if (!judgeProvider) {
|
|
2954
|
-
throw new Error("No judge provider available for rubric evaluation");
|
|
2955
|
-
}
|
|
2956
|
-
if (!this.config.rubrics || this.config.rubrics.length === 0) {
|
|
2957
|
-
throw new Error(
|
|
2958
|
-
`No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
|
|
2959
|
-
);
|
|
2960
|
-
}
|
|
2961
|
-
const prompt = this.buildPrompt(context, this.config.rubrics);
|
|
2962
|
-
const model = judgeProvider.asLanguageModel?.();
|
|
2963
|
-
if (!model) {
|
|
2964
|
-
throw new Error("Judge provider does not support language model interface");
|
|
2965
|
-
}
|
|
2966
|
-
const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
|
|
2967
|
-
You must return a valid JSON object matching this schema:
|
|
2968
|
-
{
|
|
2969
|
-
"checks": [
|
|
2970
|
-
{
|
|
2971
|
-
"id": "string (rubric id)",
|
|
2972
|
-
"satisfied": boolean,
|
|
2973
|
-
"reasoning": "string (brief explanation)"
|
|
2974
|
-
}
|
|
2975
|
-
],
|
|
2976
|
-
"overall_reasoning": "string (summary)"
|
|
2977
|
-
}`;
|
|
2978
|
-
let result;
|
|
2979
|
-
let lastError;
|
|
2980
|
-
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
2981
|
-
try {
|
|
2982
|
-
const { text } = await generateText2({
|
|
2983
|
-
model,
|
|
2984
|
-
system,
|
|
2985
|
-
prompt
|
|
2986
|
-
});
|
|
2987
|
-
const cleaned = text.replace(/```json\n?|```/g, "").trim();
|
|
2988
|
-
result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
|
|
2989
|
-
break;
|
|
2990
|
-
} catch (e) {
|
|
2991
|
-
lastError = e instanceof Error ? e : new Error(String(e));
|
|
2992
|
-
}
|
|
2993
|
-
}
|
|
2994
|
-
if (!result) {
|
|
2995
|
-
throw new Error(
|
|
2996
|
-
`Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
|
|
2997
|
-
);
|
|
2998
|
-
}
|
|
2999
|
-
const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
|
|
3000
|
-
return {
|
|
3001
|
-
score,
|
|
3002
|
-
verdict,
|
|
3003
|
-
hits,
|
|
3004
|
-
misses,
|
|
3005
|
-
expectedAspectCount: this.config.rubrics.length,
|
|
3006
|
-
reasoning: result.overall_reasoning,
|
|
3007
|
-
evaluatorRawRequest: {
|
|
3008
|
-
prompt
|
|
3009
|
-
}
|
|
3010
|
-
};
|
|
3011
|
-
}
|
|
3012
|
-
buildPrompt(context, rubrics) {
|
|
3013
|
-
const parts = [
|
|
3014
|
-
"You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
|
|
3015
|
-
"",
|
|
3016
|
-
"[[ ## question ## ]]",
|
|
3017
|
-
context.evalCase.question,
|
|
3018
|
-
"",
|
|
3019
|
-
"[[ ## expected_outcome ## ]]",
|
|
3020
|
-
context.evalCase.expected_outcome,
|
|
3021
|
-
""
|
|
3022
|
-
];
|
|
3023
|
-
if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
|
|
3024
|
-
parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
|
|
3025
|
-
}
|
|
3026
|
-
parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
|
|
3027
|
-
for (const rubric of rubrics) {
|
|
3028
|
-
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
3029
|
-
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
3030
|
-
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
|
|
3031
|
-
}
|
|
3032
|
-
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
3033
|
-
return parts.join("\n");
|
|
3034
|
-
}
|
|
3035
|
-
calculateScore(result, rubrics) {
|
|
3036
|
-
const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
|
|
3037
|
-
const hits = [];
|
|
3038
|
-
const misses = [];
|
|
3039
|
-
let totalWeight = 0;
|
|
3040
|
-
let earnedWeight = 0;
|
|
3041
|
-
let failedRequired = false;
|
|
3042
|
-
for (const check of result.checks) {
|
|
3043
|
-
const rubric = rubricMap.get(check.id);
|
|
3044
|
-
if (!rubric) {
|
|
3045
|
-
continue;
|
|
3046
|
-
}
|
|
3047
|
-
totalWeight += rubric.weight;
|
|
3048
|
-
if (check.satisfied) {
|
|
3049
|
-
earnedWeight += rubric.weight;
|
|
3050
|
-
hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
|
|
3051
|
-
} else {
|
|
3052
|
-
misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
|
|
3053
|
-
if (rubric.required) {
|
|
3054
|
-
failedRequired = true;
|
|
3055
|
-
}
|
|
3056
|
-
}
|
|
3057
|
-
}
|
|
3058
|
-
const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
|
|
3059
|
-
let verdict;
|
|
3060
|
-
if (failedRequired) {
|
|
3061
|
-
verdict = "fail";
|
|
3062
|
-
} else if (score >= 0.8) {
|
|
3063
|
-
verdict = "pass";
|
|
3064
|
-
} else if (score >= 0.6) {
|
|
3065
|
-
verdict = "borderline";
|
|
3066
|
-
} else {
|
|
3067
|
-
verdict = "fail";
|
|
3068
|
-
}
|
|
3069
|
-
return { score, verdict, hits, misses };
|
|
3070
|
-
}
|
|
3071
|
-
};
|
|
3072
|
-
|
|
3073
|
-
// src/evaluation/evaluators.ts
|
|
3074
2935
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
3075
2936
|
|
|
3076
2937
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
@@ -3088,6 +2949,21 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
3088
2949
|
|
|
3089
2950
|
[[ ## candidate_answer ## ]]
|
|
3090
2951
|
{{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
|
|
2952
|
+
var freeformEvaluationSchema = z.object({
|
|
2953
|
+
score: z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
2954
|
+
hits: z.array(z.string()).describe("Brief specific achievements").optional(),
|
|
2955
|
+
misses: z.array(z.string()).describe("Brief failures or omissions").optional(),
|
|
2956
|
+
reasoning: z.string().describe("Concise explanation (1-2 sentences)").optional()
|
|
2957
|
+
});
|
|
2958
|
+
var rubricCheckResultSchema = z.object({
|
|
2959
|
+
id: z.string().describe("The ID of the rubric item being checked"),
|
|
2960
|
+
satisfied: z.boolean().describe("Whether this rubric requirement is met"),
|
|
2961
|
+
reasoning: z.string().describe("Brief explanation (1-2 sentences) for this check")
|
|
2962
|
+
});
|
|
2963
|
+
var rubricEvaluationSchema = z.object({
|
|
2964
|
+
checks: z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
|
|
2965
|
+
overall_reasoning: z.string().describe("Overall assessment summary (1-2 sentences)")
|
|
2966
|
+
});
|
|
3091
2967
|
var LlmJudgeEvaluator = class {
|
|
3092
2968
|
kind = "llm_judge";
|
|
3093
2969
|
resolveJudgeProvider;
|
|
@@ -3105,9 +2981,13 @@ var LlmJudgeEvaluator = class {
|
|
|
3105
2981
|
if (!judgeProvider) {
|
|
3106
2982
|
throw new Error("No judge provider available for LLM grading");
|
|
3107
2983
|
}
|
|
3108
|
-
|
|
2984
|
+
const config = context.evaluator;
|
|
2985
|
+
if (config?.type === "llm_judge" && config.rubrics && config.rubrics.length > 0) {
|
|
2986
|
+
return this.evaluateWithRubrics(context, judgeProvider, config.rubrics);
|
|
2987
|
+
}
|
|
2988
|
+
return this.evaluateFreeform(context, judgeProvider);
|
|
3109
2989
|
}
|
|
3110
|
-
async
|
|
2990
|
+
async evaluateFreeform(context, judgeProvider) {
|
|
3111
2991
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
3112
2992
|
const variables = {
|
|
3113
2993
|
[TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
@@ -3124,34 +3004,132 @@ var LlmJudgeEvaluator = class {
|
|
|
3124
3004
|
const systemPrompt = buildOutputSchema();
|
|
3125
3005
|
const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
|
|
3126
3006
|
const userPrompt = substituteVariables(evaluatorTemplate, variables);
|
|
3127
|
-
const response = await judgeProvider.invoke({
|
|
3128
|
-
question: userPrompt,
|
|
3129
|
-
systemPrompt,
|
|
3130
|
-
evalCaseId: context.evalCase.id,
|
|
3131
|
-
attempt: context.attempt,
|
|
3132
|
-
maxOutputTokens: this.maxOutputTokens,
|
|
3133
|
-
temperature: this.temperature
|
|
3134
|
-
});
|
|
3135
|
-
const parsed = parseQualityResponse(response);
|
|
3136
|
-
const score = clampScore(parsed.score ?? 0);
|
|
3137
|
-
const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
3138
|
-
const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
3139
|
-
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
3140
|
-
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
3141
3007
|
const evaluatorRawRequest = {
|
|
3142
3008
|
userPrompt,
|
|
3143
3009
|
systemPrompt,
|
|
3144
3010
|
target: judgeProvider.targetName
|
|
3145
3011
|
};
|
|
3012
|
+
try {
|
|
3013
|
+
const { data, providerResponse } = await this.runWithRetry({
|
|
3014
|
+
context,
|
|
3015
|
+
judgeProvider,
|
|
3016
|
+
systemPrompt,
|
|
3017
|
+
userPrompt,
|
|
3018
|
+
schema: freeformEvaluationSchema
|
|
3019
|
+
});
|
|
3020
|
+
const score = clampScore(data.score);
|
|
3021
|
+
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
3022
|
+
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
3023
|
+
const reasoning = data.reasoning ?? providerResponse?.reasoning;
|
|
3024
|
+
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
3025
|
+
return {
|
|
3026
|
+
score,
|
|
3027
|
+
verdict: scoreToVerdict(score),
|
|
3028
|
+
hits,
|
|
3029
|
+
misses,
|
|
3030
|
+
expectedAspectCount,
|
|
3031
|
+
reasoning,
|
|
3032
|
+
evaluatorRawRequest
|
|
3033
|
+
};
|
|
3034
|
+
} catch {
|
|
3035
|
+
return {
|
|
3036
|
+
score: 0,
|
|
3037
|
+
verdict: "fail",
|
|
3038
|
+
hits: [],
|
|
3039
|
+
misses: [],
|
|
3040
|
+
expectedAspectCount: 1,
|
|
3041
|
+
evaluatorRawRequest
|
|
3042
|
+
};
|
|
3043
|
+
}
|
|
3044
|
+
}
|
|
3045
|
+
async evaluateWithRubrics(context, judgeProvider, rubrics) {
|
|
3046
|
+
if (!rubrics || rubrics.length === 0) {
|
|
3047
|
+
throw new Error(
|
|
3048
|
+
`No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
|
|
3049
|
+
);
|
|
3050
|
+
}
|
|
3051
|
+
const prompt = this.buildRubricPrompt(context, rubrics);
|
|
3052
|
+
const systemPrompt = buildRubricOutputSchema();
|
|
3053
|
+
const evaluatorRawRequest = {
|
|
3054
|
+
userPrompt: prompt,
|
|
3055
|
+
systemPrompt,
|
|
3056
|
+
target: judgeProvider.targetName
|
|
3057
|
+
};
|
|
3058
|
+
const { data } = await this.runWithRetry({
|
|
3059
|
+
context,
|
|
3060
|
+
judgeProvider,
|
|
3061
|
+
systemPrompt,
|
|
3062
|
+
userPrompt: prompt,
|
|
3063
|
+
schema: rubricEvaluationSchema
|
|
3064
|
+
});
|
|
3065
|
+
const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
|
|
3146
3066
|
return {
|
|
3147
3067
|
score,
|
|
3068
|
+
verdict,
|
|
3148
3069
|
hits,
|
|
3149
3070
|
misses,
|
|
3150
|
-
expectedAspectCount,
|
|
3151
|
-
reasoning,
|
|
3071
|
+
expectedAspectCount: rubrics.length,
|
|
3072
|
+
reasoning: data.overall_reasoning,
|
|
3152
3073
|
evaluatorRawRequest
|
|
3153
3074
|
};
|
|
3154
3075
|
}
|
|
3076
|
+
buildRubricPrompt(context, rubrics) {
|
|
3077
|
+
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
3078
|
+
const parts = [
|
|
3079
|
+
"You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
|
|
3080
|
+
"",
|
|
3081
|
+
"[[ ## question ## ]]",
|
|
3082
|
+
formattedQuestion,
|
|
3083
|
+
"",
|
|
3084
|
+
"[[ ## expected_outcome ## ]]",
|
|
3085
|
+
context.evalCase.expected_outcome,
|
|
3086
|
+
""
|
|
3087
|
+
];
|
|
3088
|
+
if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
|
|
3089
|
+
parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
|
|
3090
|
+
}
|
|
3091
|
+
parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
|
|
3092
|
+
for (const rubric of rubrics) {
|
|
3093
|
+
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
3094
|
+
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
3095
|
+
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
|
|
3096
|
+
}
|
|
3097
|
+
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
3098
|
+
return parts.join("\n");
|
|
3099
|
+
}
|
|
3100
|
+
async runWithRetry(options) {
|
|
3101
|
+
const { context, judgeProvider, systemPrompt, userPrompt, schema } = options;
|
|
3102
|
+
let lastError;
|
|
3103
|
+
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
3104
|
+
try {
|
|
3105
|
+
const model = judgeProvider.asLanguageModel?.();
|
|
3106
|
+
if (model) {
|
|
3107
|
+
const { text } = await generateText2({
|
|
3108
|
+
model,
|
|
3109
|
+
system: systemPrompt,
|
|
3110
|
+
prompt: userPrompt,
|
|
3111
|
+
...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
|
|
3112
|
+
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
3113
|
+
});
|
|
3114
|
+
const data2 = schema.parse(parseJsonFromText(text));
|
|
3115
|
+
return { data: data2 };
|
|
3116
|
+
}
|
|
3117
|
+
const response = await judgeProvider.invoke({
|
|
3118
|
+
question: userPrompt,
|
|
3119
|
+
systemPrompt,
|
|
3120
|
+
evalCaseId: context.evalCase.id,
|
|
3121
|
+
attempt: context.attempt,
|
|
3122
|
+
maxOutputTokens: this.maxOutputTokens,
|
|
3123
|
+
temperature: this.temperature
|
|
3124
|
+
});
|
|
3125
|
+
const data = schema.parse(parseJsonFromText(response.text ?? ""));
|
|
3126
|
+
return { data, providerResponse: response };
|
|
3127
|
+
} catch (e) {
|
|
3128
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
3129
|
+
}
|
|
3130
|
+
}
|
|
3131
|
+
throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
|
|
3132
|
+
}
|
|
3155
3133
|
};
|
|
3156
3134
|
function buildOutputSchema() {
|
|
3157
3135
|
return [
|
|
@@ -3165,6 +3143,29 @@ function buildOutputSchema() {
|
|
|
3165
3143
|
"}"
|
|
3166
3144
|
].join("\n");
|
|
3167
3145
|
}
|
|
3146
|
+
function buildRubricOutputSchema() {
|
|
3147
|
+
return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
|
|
3148
|
+
You must return a valid JSON object matching this schema:
|
|
3149
|
+
{
|
|
3150
|
+
"checks": [
|
|
3151
|
+
{
|
|
3152
|
+
"id": "string (rubric id)",
|
|
3153
|
+
"satisfied": boolean,
|
|
3154
|
+
"reasoning": "string (brief explanation)"
|
|
3155
|
+
}
|
|
3156
|
+
],
|
|
3157
|
+
"overall_reasoning": "string (summary)"
|
|
3158
|
+
}`;
|
|
3159
|
+
}
|
|
3160
|
+
function scoreToVerdict(score) {
|
|
3161
|
+
if (score >= 0.8) {
|
|
3162
|
+
return "pass";
|
|
3163
|
+
}
|
|
3164
|
+
if (score >= 0.6) {
|
|
3165
|
+
return "borderline";
|
|
3166
|
+
}
|
|
3167
|
+
return "fail";
|
|
3168
|
+
}
|
|
3168
3169
|
function clampScore(value) {
|
|
3169
3170
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
3170
3171
|
return 0;
|
|
@@ -3177,71 +3178,15 @@ function clampScore(value) {
|
|
|
3177
3178
|
}
|
|
3178
3179
|
return value;
|
|
3179
3180
|
}
|
|
3180
|
-
function parseQualityResponse(response) {
|
|
3181
|
-
const text = typeof response.text === "string" ? response.text.trim() : "";
|
|
3182
|
-
if (text.length === 0) {
|
|
3183
|
-
return {};
|
|
3184
|
-
}
|
|
3185
|
-
const direct = attemptParseJson(text);
|
|
3186
|
-
if (direct && validateQualityJson(direct)) {
|
|
3187
|
-
return direct;
|
|
3188
|
-
}
|
|
3189
|
-
const extracted = extractJsonBlob(text);
|
|
3190
|
-
if (extracted) {
|
|
3191
|
-
const parsed = attemptParseJson(extracted);
|
|
3192
|
-
if (parsed && validateQualityJson(parsed)) {
|
|
3193
|
-
return parsed;
|
|
3194
|
-
}
|
|
3195
|
-
}
|
|
3196
|
-
return {};
|
|
3197
|
-
}
|
|
3198
|
-
function attemptParseJson(text) {
|
|
3199
|
-
try {
|
|
3200
|
-
const parsed = JSON.parse(text);
|
|
3201
|
-
const score = typeof parsed.score === "number" ? parsed.score : void 0;
|
|
3202
|
-
const hits = parsed.hits;
|
|
3203
|
-
const misses = parsed.misses;
|
|
3204
|
-
const reasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : void 0;
|
|
3205
|
-
return { score, hits, misses, reasoning };
|
|
3206
|
-
} catch {
|
|
3207
|
-
return void 0;
|
|
3208
|
-
}
|
|
3209
|
-
}
|
|
3210
|
-
function validateQualityJson(parsed) {
|
|
3211
|
-
if (typeof parsed.score !== "number") {
|
|
3212
|
-
return false;
|
|
3213
|
-
}
|
|
3214
|
-
if (Number.isNaN(parsed.score) || !Number.isFinite(parsed.score)) {
|
|
3215
|
-
return false;
|
|
3216
|
-
}
|
|
3217
|
-
if (parsed.score < 0 || parsed.score > 1) {
|
|
3218
|
-
return false;
|
|
3219
|
-
}
|
|
3220
|
-
if (parsed.hits !== void 0) {
|
|
3221
|
-
if (!Array.isArray(parsed.hits)) {
|
|
3222
|
-
return false;
|
|
3223
|
-
}
|
|
3224
|
-
if (!parsed.hits.every((item) => typeof item === "string")) {
|
|
3225
|
-
return false;
|
|
3226
|
-
}
|
|
3227
|
-
}
|
|
3228
|
-
if (parsed.misses !== void 0) {
|
|
3229
|
-
if (!Array.isArray(parsed.misses)) {
|
|
3230
|
-
return false;
|
|
3231
|
-
}
|
|
3232
|
-
if (!parsed.misses.every((item) => typeof item === "string")) {
|
|
3233
|
-
return false;
|
|
3234
|
-
}
|
|
3235
|
-
}
|
|
3236
|
-
if (parsed.reasoning !== void 0 && typeof parsed.reasoning !== "string") {
|
|
3237
|
-
return false;
|
|
3238
|
-
}
|
|
3239
|
-
return true;
|
|
3240
|
-
}
|
|
3241
3181
|
function extractJsonBlob(text) {
|
|
3242
3182
|
const match = text.match(/\{[\s\S]*\}/);
|
|
3243
3183
|
return match?.[0];
|
|
3244
3184
|
}
|
|
3185
|
+
function parseJsonFromText(text) {
|
|
3186
|
+
const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
|
|
3187
|
+
const blob = extractJsonBlob(cleaned) ?? cleaned;
|
|
3188
|
+
return JSON.parse(blob);
|
|
3189
|
+
}
|
|
3245
3190
|
function isNonEmptyString(value) {
|
|
3246
3191
|
return typeof value === "string" && value.trim().length > 0;
|
|
3247
3192
|
}
|
|
@@ -3278,6 +3223,7 @@ var CodeEvaluator = class {
|
|
|
3278
3223
|
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
3279
3224
|
return {
|
|
3280
3225
|
score,
|
|
3226
|
+
verdict: scoreToVerdict(score),
|
|
3281
3227
|
hits,
|
|
3282
3228
|
misses,
|
|
3283
3229
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
@@ -3291,6 +3237,7 @@ var CodeEvaluator = class {
|
|
|
3291
3237
|
const message = error instanceof Error ? error.message : String(error);
|
|
3292
3238
|
return {
|
|
3293
3239
|
score: 0,
|
|
3240
|
+
verdict: "fail",
|
|
3294
3241
|
hits: [],
|
|
3295
3242
|
misses: [`Code evaluator failed: ${message}`],
|
|
3296
3243
|
expectedAspectCount: 1,
|
|
@@ -3304,6 +3251,33 @@ var CodeEvaluator = class {
|
|
|
3304
3251
|
}
|
|
3305
3252
|
}
|
|
3306
3253
|
};
|
|
3254
|
+
function calculateRubricScore(result, rubrics) {
|
|
3255
|
+
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
3256
|
+
const hits = [];
|
|
3257
|
+
const misses = [];
|
|
3258
|
+
let totalWeight = 0;
|
|
3259
|
+
let earnedWeight = 0;
|
|
3260
|
+
let failedRequired = false;
|
|
3261
|
+
for (const check of result.checks) {
|
|
3262
|
+
const rubric = rubricMap.get(check.id);
|
|
3263
|
+
if (!rubric) {
|
|
3264
|
+
continue;
|
|
3265
|
+
}
|
|
3266
|
+
totalWeight += rubric.weight;
|
|
3267
|
+
if (check.satisfied) {
|
|
3268
|
+
earnedWeight += rubric.weight;
|
|
3269
|
+
hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
|
|
3270
|
+
} else {
|
|
3271
|
+
misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
|
|
3272
|
+
if (rubric.required) {
|
|
3273
|
+
failedRequired = true;
|
|
3274
|
+
}
|
|
3275
|
+
}
|
|
3276
|
+
}
|
|
3277
|
+
const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
|
|
3278
|
+
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
3279
|
+
return { score, verdict, hits, misses };
|
|
3280
|
+
}
|
|
3307
3281
|
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
|
|
3308
3282
|
const { spawn: spawn2 } = await import("node:child_process");
|
|
3309
3283
|
return await new Promise((resolve, reject) => {
|
|
@@ -4040,7 +4014,6 @@ async function runEvaluatorList(options) {
|
|
|
4040
4014
|
reasoning: score2.reasoning,
|
|
4041
4015
|
evaluator_provider_request: score2.evaluatorRawRequest
|
|
4042
4016
|
});
|
|
4043
|
-
continue;
|
|
4044
4017
|
}
|
|
4045
4018
|
if (evaluator.type === "code") {
|
|
4046
4019
|
const codeEvaluator = new CodeEvaluator({
|
|
@@ -4068,44 +4041,12 @@ async function runEvaluatorList(options) {
|
|
|
4068
4041
|
reasoning: score2.reasoning,
|
|
4069
4042
|
evaluator_provider_request: score2.evaluatorRawRequest
|
|
4070
4043
|
});
|
|
4071
|
-
continue;
|
|
4072
|
-
}
|
|
4073
|
-
if (evaluator.type === "rubric") {
|
|
4074
|
-
const rubricEvaluator = new RubricEvaluator({
|
|
4075
|
-
config: evaluator,
|
|
4076
|
-
resolveJudgeProvider: async (context) => {
|
|
4077
|
-
if (context.judgeProvider) {
|
|
4078
|
-
return context.judgeProvider;
|
|
4079
|
-
}
|
|
4080
|
-
return judgeProvider;
|
|
4081
|
-
}
|
|
4082
|
-
});
|
|
4083
|
-
const score2 = await rubricEvaluator.evaluate({
|
|
4084
|
-
evalCase,
|
|
4085
|
-
candidate,
|
|
4086
|
-
target,
|
|
4087
|
-
provider,
|
|
4088
|
-
attempt,
|
|
4089
|
-
promptInputs,
|
|
4090
|
-
now,
|
|
4091
|
-
judgeProvider
|
|
4092
|
-
});
|
|
4093
|
-
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
4094
|
-
evaluatorResults.push({
|
|
4095
|
-
name: evaluator.name,
|
|
4096
|
-
type: evaluator.type,
|
|
4097
|
-
score: score2.score,
|
|
4098
|
-
verdict: score2.verdict,
|
|
4099
|
-
hits: score2.hits,
|
|
4100
|
-
misses: score2.misses,
|
|
4101
|
-
reasoning: score2.reasoning,
|
|
4102
|
-
evaluator_provider_request: score2.evaluatorRawRequest
|
|
4103
|
-
});
|
|
4104
4044
|
}
|
|
4105
4045
|
} catch (error) {
|
|
4106
4046
|
const message = error instanceof Error ? error.message : String(error);
|
|
4107
4047
|
const fallbackScore = {
|
|
4108
4048
|
score: 0,
|
|
4049
|
+
verdict: "fail",
|
|
4109
4050
|
hits: [],
|
|
4110
4051
|
misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
|
|
4111
4052
|
expectedAspectCount: 1,
|
|
@@ -4120,6 +4061,7 @@ async function runEvaluatorList(options) {
|
|
|
4120
4061
|
name: evaluator.name ?? "unknown",
|
|
4121
4062
|
type: evaluator.type ?? "unknown",
|
|
4122
4063
|
score: 0,
|
|
4064
|
+
verdict: "fail",
|
|
4123
4065
|
hits: [],
|
|
4124
4066
|
misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
|
|
4125
4067
|
reasoning: message
|
|
@@ -4138,6 +4080,7 @@ async function runEvaluatorList(options) {
|
|
|
4138
4080
|
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
4139
4081
|
const score = {
|
|
4140
4082
|
score: aggregateScore,
|
|
4083
|
+
verdict: scoreToVerdict2(aggregateScore),
|
|
4141
4084
|
hits,
|
|
4142
4085
|
misses,
|
|
4143
4086
|
expectedAspectCount,
|
|
@@ -4188,6 +4131,15 @@ async function resolveCustomPrompt(config) {
|
|
|
4188
4131
|
function isNonEmptyString2(value) {
|
|
4189
4132
|
return typeof value === "string" && value.trim().length > 0;
|
|
4190
4133
|
}
|
|
4134
|
+
function scoreToVerdict2(score) {
|
|
4135
|
+
if (score >= 0.8) {
|
|
4136
|
+
return "pass";
|
|
4137
|
+
}
|
|
4138
|
+
if (score >= 0.6) {
|
|
4139
|
+
return "borderline";
|
|
4140
|
+
}
|
|
4141
|
+
return "fail";
|
|
4142
|
+
}
|
|
4191
4143
|
function filterEvalCases(evalCases, evalId) {
|
|
4192
4144
|
if (!evalId) {
|
|
4193
4145
|
return evalCases;
|
|
@@ -4413,7 +4365,6 @@ function createAgentKernel() {
|
|
|
4413
4365
|
export {
|
|
4414
4366
|
CodeEvaluator,
|
|
4415
4367
|
LlmJudgeEvaluator,
|
|
4416
|
-
RubricEvaluator,
|
|
4417
4368
|
TEST_MESSAGE_ROLES,
|
|
4418
4369
|
buildDirectoryChain,
|
|
4419
4370
|
buildPromptInputs,
|