agentv 0.21.2 → 0.21.3

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
@@ -34929,25 +34929,25 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
34929
34929
  }
34930
34930
  }
34931
34931
  const _model = asString2(rawEvaluator.model);
34932
+ const rawRubrics = rawEvaluator.rubrics;
34933
+ const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
34934
+ id: asString2(rubric.id) ?? `rubric-${index + 1}`,
34935
+ description: asString2(rubric.description) ?? "",
34936
+ weight: typeof rubric.weight === "number" ? rubric.weight : 1,
34937
+ required: typeof rubric.required === "boolean" ? rubric.required : true
34938
+ })).filter((r) => r.description.length > 0) : void 0;
34932
34939
  if (typeValue === "rubric") {
34933
- const rubrics = rawEvaluator.rubrics;
34934
- if (!Array.isArray(rubrics)) {
34940
+ if (!parsedRubrics) {
34935
34941
  logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': missing rubrics array`);
34936
34942
  continue;
34937
34943
  }
34938
- const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
34939
- id: asString2(rubric.id) ?? `rubric-${index + 1}`,
34940
- description: asString2(rubric.description) ?? "",
34941
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
34942
- required: typeof rubric.required === "boolean" ? rubric.required : true
34943
- })).filter((r) => r.description.length > 0);
34944
34944
  if (parsedRubrics.length === 0) {
34945
34945
  logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': no valid rubrics found`);
34946
34946
  continue;
34947
34947
  }
34948
34948
  evaluators.push({
34949
34949
  name: name16,
34950
- type: "rubric",
34950
+ type: "llm_judge",
34951
34951
  rubrics: parsedRubrics
34952
34952
  });
34953
34953
  continue;
@@ -34956,7 +34956,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
34956
34956
  name: name16,
34957
34957
  type: "llm_judge",
34958
34958
  prompt,
34959
- promptPath
34959
+ promptPath,
34960
+ ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
34960
34961
  });
34961
34962
  }
34962
34963
  return evaluators.length > 0 ? evaluators : void 0;
@@ -35497,7 +35498,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
35497
35498
  if (rubricItems.length > 0) {
35498
35499
  const rubricEvaluator = {
35499
35500
  name: "rubric",
35500
- type: "rubric",
35501
+ type: "llm_judge",
35501
35502
  rubrics: rubricItems
35502
35503
  };
35503
35504
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
@@ -37330,144 +37331,6 @@ function createProvider(target) {
37330
37331
  }
37331
37332
  }
37332
37333
  }
37333
- var rubricCheckResultSchema = external_exports.object({
37334
- id: external_exports.string().describe("The ID of the rubric item being checked"),
37335
- satisfied: external_exports.boolean().describe("Whether this rubric requirement is met"),
37336
- reasoning: external_exports.string().describe("Brief explanation (1-2 sentences) for this check")
37337
- });
37338
- var rubricEvaluationSchema = external_exports.object({
37339
- checks: external_exports.array(rubricCheckResultSchema).describe("Results for each rubric item"),
37340
- overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)")
37341
- });
37342
- var RubricEvaluator = class {
37343
- kind = "rubric";
37344
- config;
37345
- resolveJudgeProvider;
37346
- constructor(options) {
37347
- this.config = options.config;
37348
- this.resolveJudgeProvider = options.resolveJudgeProvider;
37349
- }
37350
- async evaluate(context) {
37351
- const judgeProvider = await this.resolveJudgeProvider(context);
37352
- if (!judgeProvider) {
37353
- throw new Error("No judge provider available for rubric evaluation");
37354
- }
37355
- if (!this.config.rubrics || this.config.rubrics.length === 0) {
37356
- throw new Error(
37357
- `No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
37358
- );
37359
- }
37360
- const prompt = this.buildPrompt(context, this.config.rubrics);
37361
- const model = judgeProvider.asLanguageModel?.();
37362
- if (!model) {
37363
- throw new Error("Judge provider does not support language model interface");
37364
- }
37365
- const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
37366
- You must return a valid JSON object matching this schema:
37367
- {
37368
- "checks": [
37369
- {
37370
- "id": "string (rubric id)",
37371
- "satisfied": boolean,
37372
- "reasoning": "string (brief explanation)"
37373
- }
37374
- ],
37375
- "overall_reasoning": "string (summary)"
37376
- }`;
37377
- let result;
37378
- let lastError;
37379
- for (let attempt = 1; attempt <= 3; attempt++) {
37380
- try {
37381
- const { text: text2 } = await generateText({
37382
- model,
37383
- system,
37384
- prompt
37385
- });
37386
- const cleaned = text2.replace(/```json\n?|```/g, "").trim();
37387
- result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
37388
- break;
37389
- } catch (e) {
37390
- lastError = e instanceof Error ? e : new Error(String(e));
37391
- }
37392
- }
37393
- if (!result) {
37394
- throw new Error(
37395
- `Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
37396
- );
37397
- }
37398
- const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
37399
- return {
37400
- score,
37401
- verdict,
37402
- hits,
37403
- misses,
37404
- expectedAspectCount: this.config.rubrics.length,
37405
- reasoning: result.overall_reasoning,
37406
- evaluatorRawRequest: {
37407
- prompt
37408
- }
37409
- };
37410
- }
37411
- buildPrompt(context, rubrics) {
37412
- const parts = [
37413
- "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
37414
- "",
37415
- "[[ ## question ## ]]",
37416
- context.evalCase.question,
37417
- "",
37418
- "[[ ## expected_outcome ## ]]",
37419
- context.evalCase.expected_outcome,
37420
- ""
37421
- ];
37422
- if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
37423
- parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
37424
- }
37425
- parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
37426
- for (const rubric of rubrics) {
37427
- const requiredLabel = rubric.required ? " (REQUIRED)" : "";
37428
- const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
37429
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
37430
- }
37431
- parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
37432
- return parts.join("\n");
37433
- }
37434
- calculateScore(result, rubrics) {
37435
- const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
37436
- const hits = [];
37437
- const misses = [];
37438
- let totalWeight = 0;
37439
- let earnedWeight = 0;
37440
- let failedRequired = false;
37441
- for (const check2 of result.checks) {
37442
- const rubric = rubricMap.get(check2.id);
37443
- if (!rubric) {
37444
- continue;
37445
- }
37446
- totalWeight += rubric.weight;
37447
- if (check2.satisfied) {
37448
- earnedWeight += rubric.weight;
37449
- hits.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
37450
- } else {
37451
- misses.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
37452
- if (rubric.required) {
37453
- failedRequired = true;
37454
- }
37455
- }
37456
- }
37457
- const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
37458
- let verdict;
37459
- if (failedRequired) {
37460
- verdict = "fail";
37461
- } else if (score >= 0.8) {
37462
- verdict = "pass";
37463
- } else if (score >= 0.6) {
37464
- verdict = "borderline";
37465
- } else {
37466
- verdict = "fail";
37467
- }
37468
- return { score, verdict, hits, misses };
37469
- }
37470
- };
37471
37334
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
37472
37335
 
37473
37336
  Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -37485,6 +37348,21 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
37485
37348
 
37486
37349
  [[ ## candidate_answer ## ]]
37487
37350
  {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
37351
+ var freeformEvaluationSchema = external_exports.object({
37352
+ score: external_exports.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
37353
+ hits: external_exports.array(external_exports.string()).describe("Brief specific achievements").optional(),
37354
+ misses: external_exports.array(external_exports.string()).describe("Brief failures or omissions").optional(),
37355
+ reasoning: external_exports.string().describe("Concise explanation (1-2 sentences)").optional()
37356
+ });
37357
+ var rubricCheckResultSchema = external_exports.object({
37358
+ id: external_exports.string().describe("The ID of the rubric item being checked"),
37359
+ satisfied: external_exports.boolean().describe("Whether this rubric requirement is met"),
37360
+ reasoning: external_exports.string().describe("Brief explanation (1-2 sentences) for this check")
37361
+ });
37362
+ var rubricEvaluationSchema = external_exports.object({
37363
+ checks: external_exports.array(rubricCheckResultSchema).describe("Results for each rubric item"),
37364
+ overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)")
37365
+ });
37488
37366
  var LlmJudgeEvaluator = class {
37489
37367
  kind = "llm_judge";
37490
37368
  resolveJudgeProvider;
@@ -37502,9 +37380,13 @@ var LlmJudgeEvaluator = class {
37502
37380
  if (!judgeProvider) {
37503
37381
  throw new Error("No judge provider available for LLM grading");
37504
37382
  }
37505
- return this.evaluateWithPrompt(context, judgeProvider);
37383
+ const config2 = context.evaluator;
37384
+ if (config2?.type === "llm_judge" && config2.rubrics && config2.rubrics.length > 0) {
37385
+ return this.evaluateWithRubrics(context, judgeProvider, config2.rubrics);
37386
+ }
37387
+ return this.evaluateFreeform(context, judgeProvider);
37506
37388
  }
37507
- async evaluateWithPrompt(context, judgeProvider) {
37389
+ async evaluateFreeform(context, judgeProvider) {
37508
37390
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
37509
37391
  const variables = {
37510
37392
  [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
@@ -37521,34 +37403,132 @@ var LlmJudgeEvaluator = class {
37521
37403
  const systemPrompt = buildOutputSchema();
37522
37404
  const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
37523
37405
  const userPrompt = substituteVariables(evaluatorTemplate, variables);
37524
- const response = await judgeProvider.invoke({
37525
- question: userPrompt,
37526
- systemPrompt,
37527
- evalCaseId: context.evalCase.id,
37528
- attempt: context.attempt,
37529
- maxOutputTokens: this.maxOutputTokens,
37530
- temperature: this.temperature
37531
- });
37532
- const parsed = parseQualityResponse(response);
37533
- const score = clampScore(parsed.score ?? 0);
37534
- const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
37535
- const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
37536
- const reasoning = parsed.reasoning ?? response.reasoning;
37537
- const expectedAspectCount = Math.max(hits.length + misses.length, 1);
37538
37406
  const evaluatorRawRequest = {
37539
37407
  userPrompt,
37540
37408
  systemPrompt,
37541
37409
  target: judgeProvider.targetName
37542
37410
  };
37411
+ try {
37412
+ const { data, providerResponse } = await this.runWithRetry({
37413
+ context,
37414
+ judgeProvider,
37415
+ systemPrompt,
37416
+ userPrompt,
37417
+ schema: freeformEvaluationSchema
37418
+ });
37419
+ const score = clampScore(data.score);
37420
+ const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
37421
+ const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
37422
+ const reasoning = data.reasoning ?? providerResponse?.reasoning;
37423
+ const expectedAspectCount = Math.max(hits.length + misses.length, 1);
37424
+ return {
37425
+ score,
37426
+ verdict: scoreToVerdict(score),
37427
+ hits,
37428
+ misses,
37429
+ expectedAspectCount,
37430
+ reasoning,
37431
+ evaluatorRawRequest
37432
+ };
37433
+ } catch {
37434
+ return {
37435
+ score: 0,
37436
+ verdict: "fail",
37437
+ hits: [],
37438
+ misses: [],
37439
+ expectedAspectCount: 1,
37440
+ evaluatorRawRequest
37441
+ };
37442
+ }
37443
+ }
37444
+ async evaluateWithRubrics(context, judgeProvider, rubrics) {
37445
+ if (!rubrics || rubrics.length === 0) {
37446
+ throw new Error(
37447
+ `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
37448
+ );
37449
+ }
37450
+ const prompt = this.buildRubricPrompt(context, rubrics);
37451
+ const systemPrompt = buildRubricOutputSchema();
37452
+ const evaluatorRawRequest = {
37453
+ userPrompt: prompt,
37454
+ systemPrompt,
37455
+ target: judgeProvider.targetName
37456
+ };
37457
+ const { data } = await this.runWithRetry({
37458
+ context,
37459
+ judgeProvider,
37460
+ systemPrompt,
37461
+ userPrompt: prompt,
37462
+ schema: rubricEvaluationSchema
37463
+ });
37464
+ const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
37543
37465
  return {
37544
37466
  score,
37467
+ verdict,
37545
37468
  hits,
37546
37469
  misses,
37547
- expectedAspectCount,
37548
- reasoning,
37470
+ expectedAspectCount: rubrics.length,
37471
+ reasoning: data.overall_reasoning,
37549
37472
  evaluatorRawRequest
37550
37473
  };
37551
37474
  }
37475
+ buildRubricPrompt(context, rubrics) {
37476
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
37477
+ const parts = [
37478
+ "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
37479
+ "",
37480
+ "[[ ## question ## ]]",
37481
+ formattedQuestion,
37482
+ "",
37483
+ "[[ ## expected_outcome ## ]]",
37484
+ context.evalCase.expected_outcome,
37485
+ ""
37486
+ ];
37487
+ if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
37488
+ parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
37489
+ }
37490
+ parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
37491
+ for (const rubric of rubrics) {
37492
+ const requiredLabel = rubric.required ? " (REQUIRED)" : "";
37493
+ const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
37494
+ parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
37495
+ }
37496
+ parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
37497
+ return parts.join("\n");
37498
+ }
37499
+ async runWithRetry(options) {
37500
+ const { context, judgeProvider, systemPrompt, userPrompt, schema } = options;
37501
+ let lastError;
37502
+ for (let attempt = 1; attempt <= 3; attempt++) {
37503
+ try {
37504
+ const model = judgeProvider.asLanguageModel?.();
37505
+ if (model) {
37506
+ const { text: text2 } = await generateText({
37507
+ model,
37508
+ system: systemPrompt,
37509
+ prompt: userPrompt,
37510
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
37511
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
37512
+ });
37513
+ const data2 = schema.parse(parseJsonFromText(text2));
37514
+ return { data: data2 };
37515
+ }
37516
+ const response = await judgeProvider.invoke({
37517
+ question: userPrompt,
37518
+ systemPrompt,
37519
+ evalCaseId: context.evalCase.id,
37520
+ attempt: context.attempt,
37521
+ maxOutputTokens: this.maxOutputTokens,
37522
+ temperature: this.temperature
37523
+ });
37524
+ const data = schema.parse(parseJsonFromText(response.text ?? ""));
37525
+ return { data, providerResponse: response };
37526
+ } catch (e) {
37527
+ lastError = e instanceof Error ? e : new Error(String(e));
37528
+ }
37529
+ }
37530
+ throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
37531
+ }
37552
37532
  };
37553
37533
  function buildOutputSchema() {
37554
37534
  return [
@@ -37562,6 +37542,29 @@ function buildOutputSchema() {
37562
37542
  "}"
37563
37543
  ].join("\n");
37564
37544
  }
37545
+ function buildRubricOutputSchema() {
37546
+ return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
37547
+ You must return a valid JSON object matching this schema:
37548
+ {
37549
+ "checks": [
37550
+ {
37551
+ "id": "string (rubric id)",
37552
+ "satisfied": boolean,
37553
+ "reasoning": "string (brief explanation)"
37554
+ }
37555
+ ],
37556
+ "overall_reasoning": "string (summary)"
37557
+ }`;
37558
+ }
37559
+ function scoreToVerdict(score) {
37560
+ if (score >= 0.8) {
37561
+ return "pass";
37562
+ }
37563
+ if (score >= 0.6) {
37564
+ return "borderline";
37565
+ }
37566
+ return "fail";
37567
+ }
37565
37568
  function clampScore(value) {
37566
37569
  if (Number.isNaN(value) || !Number.isFinite(value)) {
37567
37570
  return 0;
@@ -37574,71 +37577,15 @@ function clampScore(value) {
37574
37577
  }
37575
37578
  return value;
37576
37579
  }
37577
- function parseQualityResponse(response) {
37578
- const text2 = typeof response.text === "string" ? response.text.trim() : "";
37579
- if (text2.length === 0) {
37580
- return {};
37581
- }
37582
- const direct = attemptParseJson(text2);
37583
- if (direct && validateQualityJson(direct)) {
37584
- return direct;
37585
- }
37586
- const extracted = extractJsonBlob(text2);
37587
- if (extracted) {
37588
- const parsed = attemptParseJson(extracted);
37589
- if (parsed && validateQualityJson(parsed)) {
37590
- return parsed;
37591
- }
37592
- }
37593
- return {};
37594
- }
37595
- function attemptParseJson(text2) {
37596
- try {
37597
- const parsed = JSON.parse(text2);
37598
- const score = typeof parsed.score === "number" ? parsed.score : void 0;
37599
- const hits = parsed.hits;
37600
- const misses = parsed.misses;
37601
- const reasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : void 0;
37602
- return { score, hits, misses, reasoning };
37603
- } catch {
37604
- return void 0;
37605
- }
37606
- }
37607
- function validateQualityJson(parsed) {
37608
- if (typeof parsed.score !== "number") {
37609
- return false;
37610
- }
37611
- if (Number.isNaN(parsed.score) || !Number.isFinite(parsed.score)) {
37612
- return false;
37613
- }
37614
- if (parsed.score < 0 || parsed.score > 1) {
37615
- return false;
37616
- }
37617
- if (parsed.hits !== void 0) {
37618
- if (!Array.isArray(parsed.hits)) {
37619
- return false;
37620
- }
37621
- if (!parsed.hits.every((item) => typeof item === "string")) {
37622
- return false;
37623
- }
37624
- }
37625
- if (parsed.misses !== void 0) {
37626
- if (!Array.isArray(parsed.misses)) {
37627
- return false;
37628
- }
37629
- if (!parsed.misses.every((item) => typeof item === "string")) {
37630
- return false;
37631
- }
37632
- }
37633
- if (parsed.reasoning !== void 0 && typeof parsed.reasoning !== "string") {
37634
- return false;
37635
- }
37636
- return true;
37637
- }
37638
37580
  function extractJsonBlob(text2) {
37639
37581
  const match = text2.match(/\{[\s\S]*\}/);
37640
37582
  return match?.[0];
37641
37583
  }
37584
+ function parseJsonFromText(text2) {
37585
+ const cleaned = typeof text2 === "string" ? text2.replace(/```json\n?|```/g, "").trim() : "";
37586
+ const blob = extractJsonBlob(cleaned) ?? cleaned;
37587
+ return JSON.parse(blob);
37588
+ }
37642
37589
  function isNonEmptyString(value) {
37643
37590
  return typeof value === "string" && value.trim().length > 0;
37644
37591
  }
@@ -37675,6 +37622,7 @@ var CodeEvaluator = class {
37675
37622
  const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
37676
37623
  return {
37677
37624
  score,
37625
+ verdict: scoreToVerdict(score),
37678
37626
  hits,
37679
37627
  misses,
37680
37628
  expectedAspectCount: hits.length + misses.length || 1,
@@ -37688,6 +37636,7 @@ var CodeEvaluator = class {
37688
37636
  const message = error40 instanceof Error ? error40.message : String(error40);
37689
37637
  return {
37690
37638
  score: 0,
37639
+ verdict: "fail",
37691
37640
  hits: [],
37692
37641
  misses: [`Code evaluator failed: ${message}`],
37693
37642
  expectedAspectCount: 1,
@@ -37701,6 +37650,33 @@ var CodeEvaluator = class {
37701
37650
  }
37702
37651
  }
37703
37652
  };
37653
+ function calculateRubricScore(result, rubrics) {
37654
+ const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
37655
+ const hits = [];
37656
+ const misses = [];
37657
+ let totalWeight = 0;
37658
+ let earnedWeight = 0;
37659
+ let failedRequired = false;
37660
+ for (const check2 of result.checks) {
37661
+ const rubric = rubricMap.get(check2.id);
37662
+ if (!rubric) {
37663
+ continue;
37664
+ }
37665
+ totalWeight += rubric.weight;
37666
+ if (check2.satisfied) {
37667
+ earnedWeight += rubric.weight;
37668
+ hits.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
37669
+ } else {
37670
+ misses.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
37671
+ if (rubric.required) {
37672
+ failedRequired = true;
37673
+ }
37674
+ }
37675
+ }
37676
+ const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
37677
+ const verdict = failedRequired ? "fail" : scoreToVerdict(score);
37678
+ return { score, verdict, hits, misses };
37679
+ }
37704
37680
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
37705
37681
  const { spawn: spawn22 } = await import("node:child_process");
37706
37682
  return await new Promise((resolve2, reject) => {
@@ -38426,7 +38402,6 @@ async function runEvaluatorList(options) {
38426
38402
  reasoning: score2.reasoning,
38427
38403
  evaluator_provider_request: score2.evaluatorRawRequest
38428
38404
  });
38429
- continue;
38430
38405
  }
38431
38406
  if (evaluator.type === "code") {
38432
38407
  const codeEvaluator = new CodeEvaluator({
@@ -38454,44 +38429,12 @@ async function runEvaluatorList(options) {
38454
38429
  reasoning: score2.reasoning,
38455
38430
  evaluator_provider_request: score2.evaluatorRawRequest
38456
38431
  });
38457
- continue;
38458
- }
38459
- if (evaluator.type === "rubric") {
38460
- const rubricEvaluator = new RubricEvaluator({
38461
- config: evaluator,
38462
- resolveJudgeProvider: async (context) => {
38463
- if (context.judgeProvider) {
38464
- return context.judgeProvider;
38465
- }
38466
- return judgeProvider;
38467
- }
38468
- });
38469
- const score2 = await rubricEvaluator.evaluate({
38470
- evalCase,
38471
- candidate,
38472
- target,
38473
- provider,
38474
- attempt,
38475
- promptInputs,
38476
- now,
38477
- judgeProvider
38478
- });
38479
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
38480
- evaluatorResults.push({
38481
- name: evaluator.name,
38482
- type: evaluator.type,
38483
- score: score2.score,
38484
- verdict: score2.verdict,
38485
- hits: score2.hits,
38486
- misses: score2.misses,
38487
- reasoning: score2.reasoning,
38488
- evaluator_provider_request: score2.evaluatorRawRequest
38489
- });
38490
38432
  }
38491
38433
  } catch (error40) {
38492
38434
  const message = error40 instanceof Error ? error40.message : String(error40);
38493
38435
  const fallbackScore = {
38494
38436
  score: 0,
38437
+ verdict: "fail",
38495
38438
  hits: [],
38496
38439
  misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
38497
38440
  expectedAspectCount: 1,
@@ -38506,6 +38449,7 @@ async function runEvaluatorList(options) {
38506
38449
  name: evaluator.name ?? "unknown",
38507
38450
  type: evaluator.type ?? "unknown",
38508
38451
  score: 0,
38452
+ verdict: "fail",
38509
38453
  hits: [],
38510
38454
  misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
38511
38455
  reasoning: message
@@ -38524,6 +38468,7 @@ async function runEvaluatorList(options) {
38524
38468
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
38525
38469
  const score = {
38526
38470
  score: aggregateScore,
38471
+ verdict: scoreToVerdict2(aggregateScore),
38527
38472
  hits,
38528
38473
  misses,
38529
38474
  expectedAspectCount,
@@ -38574,6 +38519,15 @@ async function resolveCustomPrompt(config2) {
38574
38519
  function isNonEmptyString2(value) {
38575
38520
  return typeof value === "string" && value.trim().length > 0;
38576
38521
  }
38522
+ function scoreToVerdict2(score) {
38523
+ if (score >= 0.8) {
38524
+ return "pass";
38525
+ }
38526
+ if (score >= 0.6) {
38527
+ return "borderline";
38528
+ }
38529
+ return "fail";
38530
+ }
38577
38531
  function filterEvalCases(evalCases, evalId) {
38578
38532
  if (!evalId) {
38579
38533
  return evalCases;
@@ -41713,4 +41667,4 @@ export {
41713
41667
  app,
41714
41668
  runCli
41715
41669
  };
41716
- //# sourceMappingURL=chunk-WOCXZEH4.js.map
41670
+ //# sourceMappingURL=chunk-A5T7W63L.js.map