agentv 0.21.2 → 0.21.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -34929,25 +34929,25 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  }
  }
  const _model = asString2(rawEvaluator.model);
+ const rawRubrics = rawEvaluator.rubrics;
+ const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
+ id: asString2(rubric.id) ?? `rubric-${index + 1}`,
+ description: asString2(rubric.description) ?? "",
+ weight: typeof rubric.weight === "number" ? rubric.weight : 1,
+ required: typeof rubric.required === "boolean" ? rubric.required : true
+ })).filter((r) => r.description.length > 0) : void 0;
  if (typeValue === "rubric") {
-
- if (!Array.isArray(rubrics)) {
+ if (!parsedRubrics) {
  logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': missing rubrics array`);
  continue;
  }
- const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
- id: asString2(rubric.id) ?? `rubric-${index + 1}`,
- description: asString2(rubric.description) ?? "",
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
- required: typeof rubric.required === "boolean" ? rubric.required : true
- })).filter((r) => r.description.length > 0);
  if (parsedRubrics.length === 0) {
  logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': no valid rubrics found`);
  continue;
  }
  evaluators.push({
  name: name16,
- type: "
+ type: "llm_judge",
  rubrics: parsedRubrics
  });
  continue;
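For reference, the parser in the hunk above reads four fields from each rubric entry and applies defaults when they are absent; a minimal sketch of an equivalent entry as plain data (the surrounding eval-file structure is assumed, not taken from this diff):

// Hypothetical rubric entry, shaped to match the fields the new parser reads.
// A missing id defaults to "rubric-<n>", weight defaults to 1, required defaults to true;
// entries with an empty description are dropped.
const exampleRubric = {
  id: "covers-error-handling",
  description: "Explains how provider errors are surfaced to the caller",
  weight: 2,
  required: false
};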
@@ -34956,7 +34956,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
  name: name16,
  type: "llm_judge",
  prompt,
- promptPath
+ promptPath,
+ ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
  });
  }
  return evaluators.length > 0 ? evaluators : void 0;
@@ -35497,7 +35498,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options)
  if (rubricItems.length > 0) {
  const rubricEvaluator = {
  name: "rubric",
- type: "
+ type: "llm_judge",
  rubrics: rubricItems
  };
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
@@ -37330,144 +37331,6 @@ function createProvider(target)
  }
  }
  }
- var rubricCheckResultSchema = external_exports.object({
- id: external_exports.string().describe("The ID of the rubric item being checked"),
- satisfied: external_exports.boolean().describe("Whether this rubric requirement is met"),
- reasoning: external_exports.string().describe("Brief explanation (1-2 sentences) for this check")
- });
- var rubricEvaluationSchema = external_exports.object({
- checks: external_exports.array(rubricCheckResultSchema).describe("Results for each rubric item"),
- overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)")
- });
- var RubricEvaluator = class {
- kind = "rubric";
- config;
- resolveJudgeProvider;
- constructor(options) {
- this.config = options.config;
- this.resolveJudgeProvider = options.resolveJudgeProvider;
- }
- async evaluate(context) {
- const judgeProvider = await this.resolveJudgeProvider(context);
- if (!judgeProvider) {
- throw new Error("No judge provider available for rubric evaluation");
- }
- if (!this.config.rubrics || this.config.rubrics.length === 0) {
- throw new Error(
- `No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
- );
- }
- const prompt = this.buildPrompt(context, this.config.rubrics);
- const model = judgeProvider.asLanguageModel?.();
- if (!model) {
- throw new Error("Judge provider does not support language model interface");
- }
- const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
- You must return a valid JSON object matching this schema:
- {
- "checks": [
- {
- "id": "string (rubric id)",
- "satisfied": boolean,
- "reasoning": "string (brief explanation)"
- }
- ],
- "overall_reasoning": "string (summary)"
- }`;
- let result;
- let lastError;
- for (let attempt = 1; attempt <= 3; attempt++) {
- try {
- const { text: text2 } = await generateText({
- model,
- system,
- prompt
- });
- const cleaned = text2.replace(/```json\n?|```/g, "").trim();
- result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
- break;
- } catch (e) {
- lastError = e instanceof Error ? e : new Error(String(e));
- }
- }
- if (!result) {
- throw new Error(
- `Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
- );
- }
- const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
- return {
- score,
- verdict,
- hits,
- misses,
- expectedAspectCount: this.config.rubrics.length,
- reasoning: result.overall_reasoning,
- evaluatorRawRequest: {
- prompt
- }
- };
- }
- buildPrompt(context, rubrics) {
- const parts = [
- "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
- "",
- "[[ ## question ## ]]",
- context.evalCase.question,
- "",
- "[[ ## expected_outcome ## ]]",
- context.evalCase.expected_outcome,
- ""
- ];
- if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
- parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
- }
- parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
- for (const rubric of rubrics) {
- const requiredLabel = rubric.required ? " (REQUIRED)" : "";
- const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
- }
- parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
- return parts.join("\n");
- }
- calculateScore(result, rubrics) {
- const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
- const hits = [];
- const misses = [];
- let totalWeight = 0;
- let earnedWeight = 0;
- let failedRequired = false;
- for (const check2 of result.checks) {
- const rubric = rubricMap.get(check2.id);
- if (!rubric) {
- continue;
- }
- totalWeight += rubric.weight;
- if (check2.satisfied) {
- earnedWeight += rubric.weight;
- hits.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
- } else {
- misses.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
- if (rubric.required) {
- failedRequired = true;
- }
- }
- }
- const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
- let verdict;
- if (failedRequired) {
- verdict = "fail";
- } else if (score >= 0.8) {
- verdict = "pass";
- } else if (score >= 0.6) {
- verdict = "borderline";
- } else {
- verdict = "fail";
- }
- return { score, verdict, hits, misses };
- }
- };
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.

  Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -37485,6 +37348,21 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r

  [[ ## candidate_answer ## ]]
  {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
+ var freeformEvaluationSchema = external_exports.object({
+ score: external_exports.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
+ hits: external_exports.array(external_exports.string()).describe("Brief specific achievements").optional(),
+ misses: external_exports.array(external_exports.string()).describe("Brief failures or omissions").optional(),
+ reasoning: external_exports.string().describe("Concise explanation (1-2 sentences)").optional()
+ });
+ var rubricCheckResultSchema = external_exports.object({
+ id: external_exports.string().describe("The ID of the rubric item being checked"),
+ satisfied: external_exports.boolean().describe("Whether this rubric requirement is met"),
+ reasoning: external_exports.string().describe("Brief explanation (1-2 sentences) for this check")
+ });
+ var rubricEvaluationSchema = external_exports.object({
+ checks: external_exports.array(rubricCheckResultSchema).describe("Results for each rubric item"),
+ overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)")
+ });
  var LlmJudgeEvaluator = class {
  kind = "llm_judge";
  resolveJudgeProvider;
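To illustrate the freeform schema added above, a judge reply that would validate against freeformEvaluationSchema could look like the following sketch (the values themselves are hypothetical; score is required and clamped to [0, 1], while hits, misses, and reasoning are optional):

const exampleFreeformJudgeResponse = {
  score: 0.75,
  hits: ["States the expected outcome explicitly"],
  misses: ["Does not mention the retry behaviour"],
  reasoning: "Covers the main points but omits one requirement."
};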
@@ -37502,9 +37380,13 @@ var LlmJudgeEvaluator = class
  if (!judgeProvider) {
  throw new Error("No judge provider available for LLM grading");
  }
-
+ const config2 = context.evaluator;
+ if (config2?.type === "llm_judge" && config2.rubrics && config2.rubrics.length > 0) {
+ return this.evaluateWithRubrics(context, judgeProvider, config2.rubrics);
+ }
+ return this.evaluateFreeform(context, judgeProvider);
  }
- async
+ async evaluateFreeform(context, judgeProvider) {
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
  const variables = {
  [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
@@ -37521,34 +37403,132 @@ var LlmJudgeEvaluator = class
  const systemPrompt = buildOutputSchema();
  const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
  const userPrompt = substituteVariables(evaluatorTemplate, variables);
- const response = await judgeProvider.invoke({
- question: userPrompt,
- systemPrompt,
- evalCaseId: context.evalCase.id,
- attempt: context.attempt,
- maxOutputTokens: this.maxOutputTokens,
- temperature: this.temperature
- });
- const parsed = parseQualityResponse(response);
- const score = clampScore(parsed.score ?? 0);
- const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
- const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
- const reasoning = parsed.reasoning ?? response.reasoning;
- const expectedAspectCount = Math.max(hits.length + misses.length, 1);
  const evaluatorRawRequest = {
  userPrompt,
  systemPrompt,
  target: judgeProvider.targetName
  };
+ try {
+ const { data, providerResponse } = await this.runWithRetry({
+ context,
+ judgeProvider,
+ systemPrompt,
+ userPrompt,
+ schema: freeformEvaluationSchema
+ });
+ const score = clampScore(data.score);
+ const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
+ const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
+ const reasoning = data.reasoning ?? providerResponse?.reasoning;
+ const expectedAspectCount = Math.max(hits.length + misses.length, 1);
+ return {
+ score,
+ verdict: scoreToVerdict(score),
+ hits,
+ misses,
+ expectedAspectCount,
+ reasoning,
+ evaluatorRawRequest
+ };
+ } catch {
+ return {
+ score: 0,
+ verdict: "fail",
+ hits: [],
+ misses: [],
+ expectedAspectCount: 1,
+ evaluatorRawRequest
+ };
+ }
+ }
+ async evaluateWithRubrics(context, judgeProvider, rubrics) {
+ if (!rubrics || rubrics.length === 0) {
+ throw new Error(
+ `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
+ );
+ }
+ const prompt = this.buildRubricPrompt(context, rubrics);
+ const systemPrompt = buildRubricOutputSchema();
+ const evaluatorRawRequest = {
+ userPrompt: prompt,
+ systemPrompt,
+ target: judgeProvider.targetName
+ };
+ const { data } = await this.runWithRetry({
+ context,
+ judgeProvider,
+ systemPrompt,
+ userPrompt: prompt,
+ schema: rubricEvaluationSchema
+ });
+ const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
  return {
  score,
+ verdict,
  hits,
  misses,
- expectedAspectCount,
- reasoning,
+ expectedAspectCount: rubrics.length,
+ reasoning: data.overall_reasoning,
  evaluatorRawRequest
  };
  }
+ buildRubricPrompt(context, rubrics) {
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
+ const parts = [
+ "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
+ "",
+ "[[ ## question ## ]]",
+ formattedQuestion,
+ "",
+ "[[ ## expected_outcome ## ]]",
+ context.evalCase.expected_outcome,
+ ""
+ ];
+ if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
+ parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
+ }
+ parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
+ for (const rubric of rubrics) {
+ const requiredLabel = rubric.required ? " (REQUIRED)" : "";
+ const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
+ parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
+ }
+ parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
+ return parts.join("\n");
+ }
+ async runWithRetry(options) {
+ const { context, judgeProvider, systemPrompt, userPrompt, schema } = options;
+ let lastError;
+ for (let attempt = 1; attempt <= 3; attempt++) {
+ try {
+ const model = judgeProvider.asLanguageModel?.();
+ if (model) {
+ const { text: text2 } = await generateText({
+ model,
+ system: systemPrompt,
+ prompt: userPrompt,
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
+ });
+ const data2 = schema.parse(parseJsonFromText(text2));
+ return { data: data2 };
+ }
+ const response = await judgeProvider.invoke({
+ question: userPrompt,
+ systemPrompt,
+ evalCaseId: context.evalCase.id,
+ attempt: context.attempt,
+ maxOutputTokens: this.maxOutputTokens,
+ temperature: this.temperature
+ });
+ const data = schema.parse(parseJsonFromText(response.text ?? ""));
+ return { data, providerResponse: response };
+ } catch (e) {
+ lastError = e instanceof Error ? e : new Error(String(e));
+ }
+ }
+ throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
+ }
  };
  function buildOutputSchema() {
  return [
@@ -37562,6 +37542,29 @@ function buildOutputSchema()
  "}"
  ].join("\n");
  }
+ function buildRubricOutputSchema() {
+ return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
+ You must return a valid JSON object matching this schema:
+ {
+ "checks": [
+ {
+ "id": "string (rubric id)",
+ "satisfied": boolean,
+ "reasoning": "string (brief explanation)"
+ }
+ ],
+ "overall_reasoning": "string (summary)"
+ }`;
+ }
+ function scoreToVerdict(score) {
+ if (score >= 0.8) {
+ return "pass";
+ }
+ if (score >= 0.6) {
+ return "borderline";
+ }
+ return "fail";
+ }
  function clampScore(value) {
  if (Number.isNaN(value) || !Number.isFinite(value)) {
  return 0;
@@ -37574,71 +37577,15 @@ function clampScore(value)
  }
  return value;
  }
- function parseQualityResponse(response) {
- const text2 = typeof response.text === "string" ? response.text.trim() : "";
- if (text2.length === 0) {
- return {};
- }
- const direct = attemptParseJson(text2);
- if (direct && validateQualityJson(direct)) {
- return direct;
- }
- const extracted = extractJsonBlob(text2);
- if (extracted) {
- const parsed = attemptParseJson(extracted);
- if (parsed && validateQualityJson(parsed)) {
- return parsed;
- }
- }
- return {};
- }
- function attemptParseJson(text2) {
- try {
- const parsed = JSON.parse(text2);
- const score = typeof parsed.score === "number" ? parsed.score : void 0;
- const hits = parsed.hits;
- const misses = parsed.misses;
- const reasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : void 0;
- return { score, hits, misses, reasoning };
- } catch {
- return void 0;
- }
- }
- function validateQualityJson(parsed) {
- if (typeof parsed.score !== "number") {
- return false;
- }
- if (Number.isNaN(parsed.score) || !Number.isFinite(parsed.score)) {
- return false;
- }
- if (parsed.score < 0 || parsed.score > 1) {
- return false;
- }
- if (parsed.hits !== void 0) {
- if (!Array.isArray(parsed.hits)) {
- return false;
- }
- if (!parsed.hits.every((item) => typeof item === "string")) {
- return false;
- }
- }
- if (parsed.misses !== void 0) {
- if (!Array.isArray(parsed.misses)) {
- return false;
- }
- if (!parsed.misses.every((item) => typeof item === "string")) {
- return false;
- }
- }
- if (parsed.reasoning !== void 0 && typeof parsed.reasoning !== "string") {
- return false;
- }
- return true;
- }
  function extractJsonBlob(text2) {
  const match = text2.match(/\{[\s\S]*\}/);
  return match?.[0];
  }
+ function parseJsonFromText(text2) {
+ const cleaned = typeof text2 === "string" ? text2.replace(/```json\n?|```/g, "").trim() : "";
+ const blob = extractJsonBlob(cleaned) ?? cleaned;
+ return JSON.parse(blob);
+ }
  function isNonEmptyString(value) {
  return typeof value === "string" && value.trim().length > 0;
  }
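A quick illustration of the new parseJsonFromText helper added above, using a hypothetical judge reply: it strips Markdown code fences, falls back to the first {...} blob via extractJsonBlob, and JSON.parses the result.

// Hypothetical input: a judge reply that wraps its JSON in a fenced block.
const reply = 'Here is the verdict:\n```json\n{ "score": 0.9, "hits": [], "misses": [] }\n```';
// parseJsonFromText(reply) removes the ``` fences, extracts the {...} blob,
// and parses it: -> { score: 0.9, hits: [], misses: [] }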
@@ -37675,6 +37622,7 @@ var CodeEvaluator = class
  const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
  return {
  score,
+ verdict: scoreToVerdict(score),
  hits,
  misses,
  expectedAspectCount: hits.length + misses.length || 1,
@@ -37688,6 +37636,7 @@ var CodeEvaluator = class
  const message = error40 instanceof Error ? error40.message : String(error40);
  return {
  score: 0,
+ verdict: "fail",
  hits: [],
  misses: [`Code evaluator failed: ${message}`],
  expectedAspectCount: 1,
@@ -37701,6 +37650,33 @@ var CodeEvaluator = class
  }
  }
  };
+ function calculateRubricScore(result, rubrics) {
+ const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
+ const hits = [];
+ const misses = [];
+ let totalWeight = 0;
+ let earnedWeight = 0;
+ let failedRequired = false;
+ for (const check2 of result.checks) {
+ const rubric = rubricMap.get(check2.id);
+ if (!rubric) {
+ continue;
+ }
+ totalWeight += rubric.weight;
+ if (check2.satisfied) {
+ earnedWeight += rubric.weight;
+ hits.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
+ } else {
+ misses.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
+ if (rubric.required) {
+ failedRequired = true;
+ }
+ }
+ }
+ const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
+ const verdict = failedRequired ? "fail" : scoreToVerdict(score);
+ return { score, verdict, hits, misses };
+ }
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
  const { spawn: spawn22 } = await import("node:child_process");
  return await new Promise((resolve2, reject) => {
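To make the weighting in calculateRubricScore above concrete, a small worked example with hypothetical rubric values:

// Two rubrics: r-1 has weight 2 and is not required, r-2 has weight 1 and is required.
// The judge marks r-1 satisfied and r-2 unsatisfied:
//   totalWeight = 3, earnedWeight = 2, score = 2 / 3 ≈ 0.67
//   a required rubric failed, so the verdict is forced to "fail";
//   without the required flag, 0.67 would map to "borderline" via scoreToVerdict.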
@@ -38426,7 +38402,6 @@ async function runEvaluatorList(options)
  reasoning: score2.reasoning,
  evaluator_provider_request: score2.evaluatorRawRequest
  });
- continue;
  }
  if (evaluator.type === "code") {
  const codeEvaluator = new CodeEvaluator({
@@ -38454,44 +38429,12 @@ async function runEvaluatorList(options)
  reasoning: score2.reasoning,
  evaluator_provider_request: score2.evaluatorRawRequest
  });
- continue;
- }
- if (evaluator.type === "rubric") {
- const rubricEvaluator = new RubricEvaluator({
- config: evaluator,
- resolveJudgeProvider: async (context) => {
- if (context.judgeProvider) {
- return context.judgeProvider;
- }
- return judgeProvider;
- }
- });
- const score2 = await rubricEvaluator.evaluate({
- evalCase,
- candidate,
- target,
- provider,
- attempt,
- promptInputs,
- now,
- judgeProvider
- });
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
- evaluatorResults.push({
- name: evaluator.name,
- type: evaluator.type,
- score: score2.score,
- verdict: score2.verdict,
- hits: score2.hits,
- misses: score2.misses,
- reasoning: score2.reasoning,
- evaluator_provider_request: score2.evaluatorRawRequest
- });
  }
  } catch (error40) {
  const message = error40 instanceof Error ? error40.message : String(error40);
  const fallbackScore = {
  score: 0,
+ verdict: "fail",
  hits: [],
  misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
  expectedAspectCount: 1,
@@ -38506,6 +38449,7 @@ async function runEvaluatorList(options)
  name: evaluator.name ?? "unknown",
  type: evaluator.type ?? "unknown",
  score: 0,
+ verdict: "fail",
  hits: [],
  misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
  reasoning: message
@@ -38524,6 +38468,7 @@ async function runEvaluatorList(options)
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
  const score = {
  score: aggregateScore,
+ verdict: scoreToVerdict2(aggregateScore),
  hits,
  misses,
  expectedAspectCount,
@@ -38574,6 +38519,15 @@ async function resolveCustomPrompt(config2)
  function isNonEmptyString2(value) {
  return typeof value === "string" && value.trim().length > 0;
  }
+ function scoreToVerdict2(score) {
+ if (score >= 0.8) {
+ return "pass";
+ }
+ if (score >= 0.6) {
+ return "borderline";
+ }
+ return "fail";
+ }
  function filterEvalCases(evalCases, evalId) {
  if (!evalId) {
  return evalCases;
@@ -41713,4 +41667,4 @@ export {
  app,
  runCli
  };
- //# sourceMappingURL=chunk-
+ //# sourceMappingURL=chunk-A5T7W63L.js.map