agentv 0.21.2 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -34562,7 +34562,7 @@ function isTestMessage(value) {
34562
34562
  }
34563
34563
  return candidate.content.every(isJsonObject);
34564
34564
  }
34565
- var EVALUATOR_KIND_VALUES = ["code", "llm_judge", "rubric"];
34565
+ var EVALUATOR_KIND_VALUES = ["code_judge", "llm_judge", "rubric", "composite"];
34566
34566
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
34567
34567
  function isEvaluatorKind(value) {
34568
34568
  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
@@ -34879,10 +34879,10 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
34879
34879
  logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
34880
34880
  continue;
34881
34881
  }
34882
- if (typeValue === "code") {
34882
+ if (typeValue === "code_judge") {
34883
34883
  const script = asString2(rawEvaluator.script);
34884
34884
  if (!script) {
34885
- logWarning2(`Skipping code evaluator '${name16}' in '${evalId}': missing script`);
34885
+ logWarning2(`Skipping code_judge evaluator '${name16}' in '${evalId}': missing script`);
34886
34886
  continue;
34887
34887
  }
34888
34888
  const cwd = asString2(rawEvaluator.cwd);
@@ -34893,7 +34893,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
34893
34893
  resolvedCwd = path32.resolve(resolved.resolvedPath);
34894
34894
  } else {
34895
34895
  logWarning2(
34896
- `Code evaluator '${name16}' in '${evalId}': cwd not found (${resolved.displayPath})`,
34896
+ `Code_judge evaluator '${name16}' in '${evalId}': cwd not found (${resolved.displayPath})`,
34897
34897
  resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
34898
34898
  );
34899
34899
  }
@@ -34909,6 +34909,105 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
34909
34909
  });
34910
34910
  continue;
34911
34911
  }
34912
+ if (typeValue === "composite") {
34913
+ const rawMembers = rawEvaluator.evaluators;
34914
+ if (!Array.isArray(rawMembers)) {
34915
+ logWarning2(
34916
+ `Skipping composite evaluator '${name16}' in '${evalId}': missing evaluators array`
34917
+ );
34918
+ continue;
34919
+ }
34920
+ const rawAggregator = rawEvaluator.aggregator;
34921
+ if (!isJsonObject2(rawAggregator)) {
34922
+ logWarning2(`Skipping composite evaluator '${name16}' in '${evalId}': missing aggregator`);
34923
+ continue;
34924
+ }
34925
+ const aggregatorType = asString2(rawAggregator.type);
34926
+ if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
34927
+ logWarning2(
34928
+ `Skipping composite evaluator '${name16}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
34929
+ );
34930
+ continue;
34931
+ }
34932
+ const memberEvaluators = [];
34933
+ for (const rawMember of rawMembers) {
34934
+ if (!isJsonObject2(rawMember)) {
34935
+ logWarning2(`Skipping invalid member evaluator in composite '${name16}' (expected object)`);
34936
+ continue;
34937
+ }
34938
+ const memberName = asString2(rawMember.name);
34939
+ const memberType = rawMember.type;
34940
+ if (!memberName || !isEvaluatorKind(memberType)) {
34941
+ logWarning2(`Skipping member evaluator with invalid name/type in composite '${name16}'`);
34942
+ continue;
34943
+ }
34944
+ const memberConfigs = await parseEvaluators(
34945
+ { evaluators: [rawMember] },
34946
+ void 0,
34947
+ searchRoots,
34948
+ `${evalId}:${name16}:${memberName}`
34949
+ );
34950
+ if (memberConfigs && memberConfigs.length > 0) {
34951
+ memberEvaluators.push(memberConfigs[0]);
34952
+ }
34953
+ }
34954
+ if (memberEvaluators.length === 0) {
34955
+ logWarning2(
34956
+ `Skipping composite evaluator '${name16}' in '${evalId}': no valid member evaluators`
34957
+ );
34958
+ continue;
34959
+ }
34960
+ let aggregator;
34961
+ if (aggregatorType === "weighted_average") {
34962
+ const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
34963
+ const parsedWeights = {};
34964
+ if (weights) {
34965
+ for (const [key2, value] of Object.entries(weights)) {
34966
+ if (typeof value === "number") {
34967
+ parsedWeights[key2] = value;
34968
+ }
34969
+ }
34970
+ }
34971
+ aggregator = {
34972
+ type: "weighted_average",
34973
+ ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
34974
+ };
34975
+ } else if (aggregatorType === "code_judge") {
34976
+ const aggregatorPath = asString2(rawAggregator.path);
34977
+ if (!aggregatorPath) {
34978
+ logWarning2(
34979
+ `Skipping composite evaluator '${name16}' in '${evalId}': code_judge aggregator missing path`
34980
+ );
34981
+ continue;
34982
+ }
34983
+ aggregator = {
34984
+ type: "code_judge",
34985
+ path: aggregatorPath,
34986
+ cwd: searchRoots[0]
34987
+ };
34988
+ } else {
34989
+ const aggregatorPrompt = asString2(rawAggregator.prompt);
34990
+ let promptPath2;
34991
+ if (aggregatorPrompt) {
34992
+ const resolved = await resolveFileReference2(aggregatorPrompt, searchRoots);
34993
+ if (resolved.resolvedPath) {
34994
+ promptPath2 = path32.resolve(resolved.resolvedPath);
34995
+ }
34996
+ }
34997
+ aggregator = {
34998
+ type: "llm_judge",
34999
+ ...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
35000
+ ...promptPath2 ? { promptPath: promptPath2 } : {}
35001
+ };
35002
+ }
35003
+ evaluators.push({
35004
+ name: name16,
35005
+ type: "composite",
35006
+ evaluators: memberEvaluators,
35007
+ aggregator
35008
+ });
35009
+ continue;
35010
+ }
34912
35011
  const prompt = asString2(rawEvaluator.prompt);
34913
35012
  let promptPath;
34914
35013
  if (prompt) {
@@ -34929,25 +35028,25 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
34929
35028
  }
34930
35029
  }
34931
35030
  const _model = asString2(rawEvaluator.model);
35031
+ const rawRubrics = rawEvaluator.rubrics;
35032
+ const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
35033
+ id: asString2(rubric.id) ?? `rubric-${index + 1}`,
35034
+ description: asString2(rubric.description) ?? "",
35035
+ weight: typeof rubric.weight === "number" ? rubric.weight : 1,
35036
+ required: typeof rubric.required === "boolean" ? rubric.required : true
35037
+ })).filter((r) => r.description.length > 0) : void 0;
34932
35038
  if (typeValue === "rubric") {
34933
- const rubrics = rawEvaluator.rubrics;
34934
- if (!Array.isArray(rubrics)) {
35039
+ if (!parsedRubrics) {
34935
35040
  logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': missing rubrics array`);
34936
35041
  continue;
34937
35042
  }
34938
- const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
34939
- id: asString2(rubric.id) ?? `rubric-${index + 1}`,
34940
- description: asString2(rubric.description) ?? "",
34941
- weight: typeof rubric.weight === "number" ? rubric.weight : 1,
34942
- required: typeof rubric.required === "boolean" ? rubric.required : true
34943
- })).filter((r) => r.description.length > 0);
34944
35043
  if (parsedRubrics.length === 0) {
34945
35044
  logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': no valid rubrics found`);
34946
35045
  continue;
34947
35046
  }
34948
35047
  evaluators.push({
34949
35048
  name: name16,
34950
- type: "rubric",
35049
+ type: "llm_judge",
34951
35050
  rubrics: parsedRubrics
34952
35051
  });
34953
35052
  continue;
@@ -34956,7 +35055,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
34956
35055
  name: name16,
34957
35056
  type: "llm_judge",
34958
35057
  prompt,
34959
- promptPath
35058
+ promptPath,
35059
+ ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
34960
35060
  });
34961
35061
  }
34962
35062
  return evaluators.length > 0 ? evaluators : void 0;
@@ -35497,7 +35597,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
35497
35597
  if (rubricItems.length > 0) {
35498
35598
  const rubricEvaluator = {
35499
35599
  name: "rubric",
35500
- type: "rubric",
35600
+ type: "llm_judge",
35501
35601
  rubrics: rubricItems
35502
35602
  };
35503
35603
  evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
@@ -37330,144 +37430,6 @@ function createProvider(target) {
37330
37430
  }
37331
37431
  }
37332
37432
  }
37333
- var rubricCheckResultSchema = external_exports.object({
37334
- id: external_exports.string().describe("The ID of the rubric item being checked"),
37335
- satisfied: external_exports.boolean().describe("Whether this rubric requirement is met"),
37336
- reasoning: external_exports.string().describe("Brief explanation (1-2 sentences) for this check")
37337
- });
37338
- var rubricEvaluationSchema = external_exports.object({
37339
- checks: external_exports.array(rubricCheckResultSchema).describe("Results for each rubric item"),
37340
- overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)")
37341
- });
37342
- var RubricEvaluator = class {
37343
- kind = "rubric";
37344
- config;
37345
- resolveJudgeProvider;
37346
- constructor(options) {
37347
- this.config = options.config;
37348
- this.resolveJudgeProvider = options.resolveJudgeProvider;
37349
- }
37350
- async evaluate(context) {
37351
- const judgeProvider = await this.resolveJudgeProvider(context);
37352
- if (!judgeProvider) {
37353
- throw new Error("No judge provider available for rubric evaluation");
37354
- }
37355
- if (!this.config.rubrics || this.config.rubrics.length === 0) {
37356
- throw new Error(
37357
- `No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
37358
- );
37359
- }
37360
- const prompt = this.buildPrompt(context, this.config.rubrics);
37361
- const model = judgeProvider.asLanguageModel?.();
37362
- if (!model) {
37363
- throw new Error("Judge provider does not support language model interface");
37364
- }
37365
- const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
37366
- You must return a valid JSON object matching this schema:
37367
- {
37368
- "checks": [
37369
- {
37370
- "id": "string (rubric id)",
37371
- "satisfied": boolean,
37372
- "reasoning": "string (brief explanation)"
37373
- }
37374
- ],
37375
- "overall_reasoning": "string (summary)"
37376
- }`;
37377
- let result;
37378
- let lastError;
37379
- for (let attempt = 1; attempt <= 3; attempt++) {
37380
- try {
37381
- const { text: text2 } = await generateText({
37382
- model,
37383
- system,
37384
- prompt
37385
- });
37386
- const cleaned = text2.replace(/```json\n?|```/g, "").trim();
37387
- result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
37388
- break;
37389
- } catch (e) {
37390
- lastError = e instanceof Error ? e : new Error(String(e));
37391
- }
37392
- }
37393
- if (!result) {
37394
- throw new Error(
37395
- `Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
37396
- );
37397
- }
37398
- const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
37399
- return {
37400
- score,
37401
- verdict,
37402
- hits,
37403
- misses,
37404
- expectedAspectCount: this.config.rubrics.length,
37405
- reasoning: result.overall_reasoning,
37406
- evaluatorRawRequest: {
37407
- prompt
37408
- }
37409
- };
37410
- }
37411
- buildPrompt(context, rubrics) {
37412
- const parts = [
37413
- "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
37414
- "",
37415
- "[[ ## question ## ]]",
37416
- context.evalCase.question,
37417
- "",
37418
- "[[ ## expected_outcome ## ]]",
37419
- context.evalCase.expected_outcome,
37420
- ""
37421
- ];
37422
- if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
37423
- parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
37424
- }
37425
- parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
37426
- for (const rubric of rubrics) {
37427
- const requiredLabel = rubric.required ? " (REQUIRED)" : "";
37428
- const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
37429
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
37430
- }
37431
- parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
37432
- return parts.join("\n");
37433
- }
37434
- calculateScore(result, rubrics) {
37435
- const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
37436
- const hits = [];
37437
- const misses = [];
37438
- let totalWeight = 0;
37439
- let earnedWeight = 0;
37440
- let failedRequired = false;
37441
- for (const check2 of result.checks) {
37442
- const rubric = rubricMap.get(check2.id);
37443
- if (!rubric) {
37444
- continue;
37445
- }
37446
- totalWeight += rubric.weight;
37447
- if (check2.satisfied) {
37448
- earnedWeight += rubric.weight;
37449
- hits.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
37450
- } else {
37451
- misses.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
37452
- if (rubric.required) {
37453
- failedRequired = true;
37454
- }
37455
- }
37456
- }
37457
- const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
37458
- let verdict;
37459
- if (failedRequired) {
37460
- verdict = "fail";
37461
- } else if (score >= 0.8) {
37462
- verdict = "pass";
37463
- } else if (score >= 0.6) {
37464
- verdict = "borderline";
37465
- } else {
37466
- verdict = "fail";
37467
- }
37468
- return { score, verdict, hits, misses };
37469
- }
37470
- };
37471
37433
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
37472
37434
 
37473
37435
  Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -37485,6 +37447,21 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
37485
37447
 
37486
37448
  [[ ## candidate_answer ## ]]
37487
37449
  {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
37450
+ var freeformEvaluationSchema = external_exports.object({
37451
+ score: external_exports.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
37452
+ hits: external_exports.array(external_exports.string()).describe("Brief specific achievements").optional(),
37453
+ misses: external_exports.array(external_exports.string()).describe("Brief failures or omissions").optional(),
37454
+ reasoning: external_exports.string().describe("Concise explanation (1-2 sentences)").optional()
37455
+ });
37456
+ var rubricCheckResultSchema = external_exports.object({
37457
+ id: external_exports.string().describe("The ID of the rubric item being checked"),
37458
+ satisfied: external_exports.boolean().describe("Whether this rubric requirement is met"),
37459
+ reasoning: external_exports.string().describe("Brief explanation (1-2 sentences) for this check")
37460
+ });
37461
+ var rubricEvaluationSchema = external_exports.object({
37462
+ checks: external_exports.array(rubricCheckResultSchema).describe("Results for each rubric item"),
37463
+ overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)")
37464
+ });
37488
37465
  var LlmJudgeEvaluator = class {
37489
37466
  kind = "llm_judge";
37490
37467
  resolveJudgeProvider;
@@ -37502,9 +37479,13 @@ var LlmJudgeEvaluator = class {
37502
37479
  if (!judgeProvider) {
37503
37480
  throw new Error("No judge provider available for LLM grading");
37504
37481
  }
37505
- return this.evaluateWithPrompt(context, judgeProvider);
37482
+ const config2 = context.evaluator;
37483
+ if (config2?.type === "llm_judge" && config2.rubrics && config2.rubrics.length > 0) {
37484
+ return this.evaluateWithRubrics(context, judgeProvider, config2.rubrics);
37485
+ }
37486
+ return this.evaluateFreeform(context, judgeProvider);
37506
37487
  }
37507
- async evaluateWithPrompt(context, judgeProvider) {
37488
+ async evaluateFreeform(context, judgeProvider) {
37508
37489
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
37509
37490
  const variables = {
37510
37491
  [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
@@ -37521,34 +37502,132 @@ var LlmJudgeEvaluator = class {
37521
37502
  const systemPrompt = buildOutputSchema();
37522
37503
  const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
37523
37504
  const userPrompt = substituteVariables(evaluatorTemplate, variables);
37524
- const response = await judgeProvider.invoke({
37525
- question: userPrompt,
37526
- systemPrompt,
37527
- evalCaseId: context.evalCase.id,
37528
- attempt: context.attempt,
37529
- maxOutputTokens: this.maxOutputTokens,
37530
- temperature: this.temperature
37531
- });
37532
- const parsed = parseQualityResponse(response);
37533
- const score = clampScore(parsed.score ?? 0);
37534
- const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
37535
- const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
37536
- const reasoning = parsed.reasoning ?? response.reasoning;
37537
- const expectedAspectCount = Math.max(hits.length + misses.length, 1);
37538
37505
  const evaluatorRawRequest = {
37539
37506
  userPrompt,
37540
37507
  systemPrompt,
37541
37508
  target: judgeProvider.targetName
37542
37509
  };
37510
+ try {
37511
+ const { data, providerResponse } = await this.runWithRetry({
37512
+ context,
37513
+ judgeProvider,
37514
+ systemPrompt,
37515
+ userPrompt,
37516
+ schema: freeformEvaluationSchema
37517
+ });
37518
+ const score = clampScore(data.score);
37519
+ const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
37520
+ const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
37521
+ const reasoning = data.reasoning ?? providerResponse?.reasoning;
37522
+ const expectedAspectCount = Math.max(hits.length + misses.length, 1);
37523
+ return {
37524
+ score,
37525
+ verdict: scoreToVerdict(score),
37526
+ hits,
37527
+ misses,
37528
+ expectedAspectCount,
37529
+ reasoning,
37530
+ evaluatorRawRequest
37531
+ };
37532
+ } catch {
37533
+ return {
37534
+ score: 0,
37535
+ verdict: "fail",
37536
+ hits: [],
37537
+ misses: [],
37538
+ expectedAspectCount: 1,
37539
+ evaluatorRawRequest
37540
+ };
37541
+ }
37542
+ }
37543
+ async evaluateWithRubrics(context, judgeProvider, rubrics) {
37544
+ if (!rubrics || rubrics.length === 0) {
37545
+ throw new Error(
37546
+ `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
37547
+ );
37548
+ }
37549
+ const prompt = this.buildRubricPrompt(context, rubrics);
37550
+ const systemPrompt = buildRubricOutputSchema();
37551
+ const evaluatorRawRequest = {
37552
+ userPrompt: prompt,
37553
+ systemPrompt,
37554
+ target: judgeProvider.targetName
37555
+ };
37556
+ const { data } = await this.runWithRetry({
37557
+ context,
37558
+ judgeProvider,
37559
+ systemPrompt,
37560
+ userPrompt: prompt,
37561
+ schema: rubricEvaluationSchema
37562
+ });
37563
+ const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
37543
37564
  return {
37544
37565
  score,
37566
+ verdict,
37545
37567
  hits,
37546
37568
  misses,
37547
- expectedAspectCount,
37548
- reasoning,
37569
+ expectedAspectCount: rubrics.length,
37570
+ reasoning: data.overall_reasoning,
37549
37571
  evaluatorRawRequest
37550
37572
  };
37551
37573
  }
37574
+ buildRubricPrompt(context, rubrics) {
37575
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
37576
+ const parts = [
37577
+ "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
37578
+ "",
37579
+ "[[ ## question ## ]]",
37580
+ formattedQuestion,
37581
+ "",
37582
+ "[[ ## expected_outcome ## ]]",
37583
+ context.evalCase.expected_outcome,
37584
+ ""
37585
+ ];
37586
+ if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
37587
+ parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
37588
+ }
37589
+ parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
37590
+ for (const rubric of rubrics) {
37591
+ const requiredLabel = rubric.required ? " (REQUIRED)" : "";
37592
+ const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
37593
+ parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
37594
+ }
37595
+ parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
37596
+ return parts.join("\n");
37597
+ }
37598
+ async runWithRetry(options) {
37599
+ const { context, judgeProvider, systemPrompt, userPrompt, schema } = options;
37600
+ let lastError;
37601
+ for (let attempt = 1; attempt <= 3; attempt++) {
37602
+ try {
37603
+ const model = judgeProvider.asLanguageModel?.();
37604
+ if (model) {
37605
+ const { text: text2 } = await generateText({
37606
+ model,
37607
+ system: systemPrompt,
37608
+ prompt: userPrompt,
37609
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
37610
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
37611
+ });
37612
+ const data2 = schema.parse(parseJsonFromText(text2));
37613
+ return { data: data2 };
37614
+ }
37615
+ const response = await judgeProvider.invoke({
37616
+ question: userPrompt,
37617
+ systemPrompt,
37618
+ evalCaseId: context.evalCase.id,
37619
+ attempt: context.attempt,
37620
+ maxOutputTokens: this.maxOutputTokens,
37621
+ temperature: this.temperature
37622
+ });
37623
+ const data = schema.parse(parseJsonFromText(response.text ?? ""));
37624
+ return { data, providerResponse: response };
37625
+ } catch (e) {
37626
+ lastError = e instanceof Error ? e : new Error(String(e));
37627
+ }
37628
+ }
37629
+ throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
37630
+ }
37552
37631
  };
37553
37632
  function buildOutputSchema() {
37554
37633
  return [
@@ -37562,6 +37641,29 @@ function buildOutputSchema() {
37562
37641
  "}"
37563
37642
  ].join("\n");
37564
37643
  }
37644
+ function buildRubricOutputSchema() {
37645
+ return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
37646
+ You must return a valid JSON object matching this schema:
37647
+ {
37648
+ "checks": [
37649
+ {
37650
+ "id": "string (rubric id)",
37651
+ "satisfied": boolean,
37652
+ "reasoning": "string (brief explanation)"
37653
+ }
37654
+ ],
37655
+ "overall_reasoning": "string (summary)"
37656
+ }`;
37657
+ }
37658
+ function scoreToVerdict(score) {
37659
+ if (score >= 0.8) {
37660
+ return "pass";
37661
+ }
37662
+ if (score >= 0.6) {
37663
+ return "borderline";
37664
+ }
37665
+ return "fail";
37666
+ }
37565
37667
  function clampScore(value) {
37566
37668
  if (Number.isNaN(value) || !Number.isFinite(value)) {
37567
37669
  return 0;
@@ -37574,71 +37676,15 @@ function clampScore(value) {
37574
37676
  }
37575
37677
  return value;
37576
37678
  }
37577
- function parseQualityResponse(response) {
37578
- const text2 = typeof response.text === "string" ? response.text.trim() : "";
37579
- if (text2.length === 0) {
37580
- return {};
37581
- }
37582
- const direct = attemptParseJson(text2);
37583
- if (direct && validateQualityJson(direct)) {
37584
- return direct;
37585
- }
37586
- const extracted = extractJsonBlob(text2);
37587
- if (extracted) {
37588
- const parsed = attemptParseJson(extracted);
37589
- if (parsed && validateQualityJson(parsed)) {
37590
- return parsed;
37591
- }
37592
- }
37593
- return {};
37594
- }
37595
- function attemptParseJson(text2) {
37596
- try {
37597
- const parsed = JSON.parse(text2);
37598
- const score = typeof parsed.score === "number" ? parsed.score : void 0;
37599
- const hits = parsed.hits;
37600
- const misses = parsed.misses;
37601
- const reasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : void 0;
37602
- return { score, hits, misses, reasoning };
37603
- } catch {
37604
- return void 0;
37605
- }
37606
- }
37607
- function validateQualityJson(parsed) {
37608
- if (typeof parsed.score !== "number") {
37609
- return false;
37610
- }
37611
- if (Number.isNaN(parsed.score) || !Number.isFinite(parsed.score)) {
37612
- return false;
37613
- }
37614
- if (parsed.score < 0 || parsed.score > 1) {
37615
- return false;
37616
- }
37617
- if (parsed.hits !== void 0) {
37618
- if (!Array.isArray(parsed.hits)) {
37619
- return false;
37620
- }
37621
- if (!parsed.hits.every((item) => typeof item === "string")) {
37622
- return false;
37623
- }
37624
- }
37625
- if (parsed.misses !== void 0) {
37626
- if (!Array.isArray(parsed.misses)) {
37627
- return false;
37628
- }
37629
- if (!parsed.misses.every((item) => typeof item === "string")) {
37630
- return false;
37631
- }
37632
- }
37633
- if (parsed.reasoning !== void 0 && typeof parsed.reasoning !== "string") {
37634
- return false;
37635
- }
37636
- return true;
37637
- }
37638
37679
  function extractJsonBlob(text2) {
37639
37680
  const match = text2.match(/\{[\s\S]*\}/);
37640
37681
  return match?.[0];
37641
37682
  }
37683
+ function parseJsonFromText(text2) {
37684
+ const cleaned = typeof text2 === "string" ? text2.replace(/```json\n?|```/g, "").trim() : "";
37685
+ const blob = extractJsonBlob(cleaned) ?? cleaned;
37686
+ return JSON.parse(blob);
37687
+ }
37642
37688
  function isNonEmptyString(value) {
37643
37689
  return typeof value === "string" && value.trim().length > 0;
37644
37690
  }
@@ -37675,6 +37721,7 @@ var CodeEvaluator = class {
37675
37721
  const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
37676
37722
  return {
37677
37723
  score,
37724
+ verdict: scoreToVerdict(score),
37678
37725
  hits,
37679
37726
  misses,
37680
37727
  expectedAspectCount: hits.length + misses.length || 1,
@@ -37688,6 +37735,7 @@ var CodeEvaluator = class {
37688
37735
  const message = error40 instanceof Error ? error40.message : String(error40);
37689
37736
  return {
37690
37737
  score: 0,
37738
+ verdict: "fail",
37691
37739
  hits: [],
37692
37740
  misses: [`Code evaluator failed: ${message}`],
37693
37741
  expectedAspectCount: 1,
@@ -37701,6 +37749,33 @@ var CodeEvaluator = class {
37701
37749
  }
37702
37750
  }
37703
37751
  };
37752
+ function calculateRubricScore(result, rubrics) {
37753
+ const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
37754
+ const hits = [];
37755
+ const misses = [];
37756
+ let totalWeight = 0;
37757
+ let earnedWeight = 0;
37758
+ let failedRequired = false;
37759
+ for (const check2 of result.checks) {
37760
+ const rubric = rubricMap.get(check2.id);
37761
+ if (!rubric) {
37762
+ continue;
37763
+ }
37764
+ totalWeight += rubric.weight;
37765
+ if (check2.satisfied) {
37766
+ earnedWeight += rubric.weight;
37767
+ hits.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
37768
+ } else {
37769
+ misses.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
37770
+ if (rubric.required) {
37771
+ failedRequired = true;
37772
+ }
37773
+ }
37774
+ }
37775
+ const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
37776
+ const verdict = failedRequired ? "fail" : scoreToVerdict(score);
37777
+ return { score, verdict, hits, misses };
37778
+ }
37704
37779
  async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
37705
37780
  const { spawn: spawn22 } = await import("node:child_process");
37706
37781
  return await new Promise((resolve2, reject) => {
@@ -37752,6 +37827,228 @@ function substituteVariables(template, variables) {
37752
37827
  return variables[varName] ?? match;
37753
37828
  });
37754
37829
  }
37830
// Fallback user prompt for CompositeEvaluator's llm_judge aggregator, used
// when the aggregator config supplies no `prompt` of its own. The literal
// placeholder {{EVALUATOR_RESULTS_JSON}} is replaced (via regex, all
// occurrences) with the JSON-serialized member-evaluator results before the
// prompt is sent to the judge model.
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
{{EVALUATOR_RESULTS_JSON}}

Decide the final score and verdict based on all evaluator results.
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
37835
// Evaluator that runs a list of member evaluators in parallel and combines
// their results with one of three aggregation strategies: an external
// "code_judge" script, an "llm_judge" model call, or (default) a weighted
// average. Member evaluators are created lazily through the injected
// evaluatorFactory, which allows composites to nest.
var CompositeEvaluator = class {
  kind = "composite";
  // Composite config: { evaluators: [...], aggregator: { type, ... } }.
  config;
  // { create(memberConfig, context) } — builds a member evaluator instance.
  evaluatorFactory;
  // Default working directory for a code_judge aggregator script.
  cwd;
  constructor(options) {
    this.config = options.config;
    this.evaluatorFactory = options.evaluatorFactory;
    this.cwd = options.cwd;
  }
  // Run every member evaluator concurrently, then hand the collected
  // per-member results to the configured aggregator.
  // NOTE(review): Promise.all is fail-fast — one rejecting member rejects the
  // whole composite; confirm that is the intended contract.
  async evaluate(context) {
    const memberResults = await Promise.all(
      this.config.evaluators.map(async (memberConfig) => {
        const evaluator = this.evaluatorFactory.create(memberConfig, context);
        return {
          id: memberConfig.name,
          type: memberConfig.type,
          result: await evaluator.evaluate(context)
        };
      })
    );
    return this.aggregate(memberResults, context);
  }
  // Dispatch on aggregator.type; any unrecognized type falls through to the
  // weighted-average strategy.
  // NOTE(review): runCodeAggregator accepts a `weights` parameter that is not
  // forwarded here — per-member weights are silently ignored for code_judge
  // aggregation; confirm whether that is intentional.
  async aggregate(results, context) {
    const aggregator = this.config.aggregator;
    switch (aggregator.type) {
      case "code_judge":
        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
      case "llm_judge":
        return this.runLlmAggregator(results, context, aggregator);
      default:
        return this.runWeightedAverage(results, aggregator.weights);
    }
  }
  // Combine member scores as a weighted mean (weight defaults to 1 per
  // member). Hits/misses are pooled with a "[memberName] " prefix; member
  // reasonings are joined with "; ".
  runWeightedAverage(results, weights) {
    let totalWeight = 0;
    let weightedSum = 0;
    const allHits = [];
    const allMisses = [];
    const reasoningParts = [];
    const evaluatorResults = [];
    for (const member of results) {
      const weight = weights?.[member.id] ?? 1;
      totalWeight += weight;
      weightedSum += member.result.score * weight;
      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
      if (member.result.reasoning) {
        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
      }
      // Snapshot of each member's outcome, preserved for reporting.
      evaluatorResults.push({
        name: member.id,
        type: member.type,
        score: member.result.score,
        weight,
        verdict: member.result.verdict,
        hits: [...member.result.hits],
        misses: [...member.result.misses],
        reasoning: member.result.reasoning,
        evaluatorRawRequest: member.result.evaluatorRawRequest,
        evaluatorResults: member.result.evaluatorResults
      });
    }
    // Zero total weight (no members) yields a score of 0.
    const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
    return {
      score: clampScore(finalScore),
      verdict: scoreToVerdict(finalScore),
      hits: allHits,
      misses: allMisses,
      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
      evaluatorRawRequest: {
        aggregator: "weighted_average",
        ...weights ? { weights } : {}
      },
      evaluatorResults
    };
  }
  // Pipe all member results (as pretty-printed JSON keyed by member name) to
  // an external script's stdin and parse its stdout as the aggregate result.
  // A script/parse failure degrades to a score-0 "fail" with the error text
  // recorded in misses and evaluatorRawRequest.error.
  async runCodeAggregator(results, scriptPath, cwd, weights) {
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
    const evaluatorResults = results.map((member) => ({
      name: member.id,
      type: member.type,
      score: member.result.score,
      weight: weights?.[member.id] ?? 1,
      verdict: member.result.verdict,
      hits: [...member.result.hits],
      misses: [...member.result.misses],
      reasoning: member.result.reasoning,
      evaluatorRawRequest: member.result.evaluatorRawRequest,
      evaluatorResults: member.result.evaluatorResults
    }));
    try {
      // void 0 timeout: no explicit timeout is applied to the script here.
      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
      const parsed = parseJsonSafe(stdout);
      // Missing/non-numeric score from the script defaults to 0.
      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
      // Accept the script's verdict only if it is one of the three known
      // labels; otherwise derive the verdict from the score.
      const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
      return {
        score,
        verdict,
        hits,
        misses,
        expectedAspectCount: hits.length + misses.length || 1,
        reasoning,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath
        },
        evaluatorResults
      };
    } catch (error40) {
      const message = error40 instanceof Error ? error40.message : String(error40);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`Code aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath,
          error: message
        },
        evaluatorResults
      };
    }
  }
  // Ask a judge model to produce the aggregate result. The prompt template
  // (config or DEFAULT_COMPOSITE_AGGREGATOR_PROMPT) has its
  // {{EVALUATOR_RESULTS_JSON}} placeholder(s) replaced with the member
  // results JSON. Prefers the provider's native language-model interface
  // (generateText) when available, otherwise falls back to provider.invoke.
  async runLlmAggregator(results, context, config2) {
    const judgeProvider = context.judgeProvider;
    if (!judgeProvider) {
      throw new Error("No judge provider available for LLM aggregation");
    }
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const resultsJson = JSON.stringify(resultsObject, null, 2);
    const evaluatorResults = results.map((member) => ({
      name: member.id,
      type: member.type,
      score: member.result.score,
      verdict: member.result.verdict,
      hits: [...member.result.hits],
      misses: [...member.result.misses],
      reasoning: member.result.reasoning,
      evaluatorRawRequest: member.result.evaluatorRawRequest,
      evaluatorResults: member.result.evaluatorResults
    }));
    const promptTemplate = config2.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
    const systemPrompt = buildOutputSchema();
    // Captured up front so both success and failure paths can report exactly
    // what was (or would have been) sent to the judge.
    const evaluatorRawRequest = {
      aggregator: "llm_judge",
      userPrompt,
      systemPrompt,
      target: judgeProvider.targetName
    };
    try {
      const model = judgeProvider.asLanguageModel?.();
      if (model) {
        // Direct language-model path.
        const { text: text2 } = await generateText({
          model,
          system: systemPrompt,
          prompt: userPrompt
        });
        const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text2));
        const score2 = clampScore(data2.score);
        // Hits/misses are capped at 4 entries each.
        const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
        const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
        const reasoning2 = data2.reasoning;
        return {
          score: score2,
          verdict: scoreToVerdict(score2),
          hits: hits2,
          misses: misses2,
          expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
          reasoning: reasoning2,
          evaluatorRawRequest,
          evaluatorResults
        };
      }
      // Generic provider path: same parsing/capping as above.
      const response = await judgeProvider.invoke({
        question: userPrompt,
        systemPrompt,
        evalCaseId: context.evalCase.id,
        attempt: context.attempt
      });
      const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
      const score = clampScore(data.score);
      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
      const reasoning = data.reasoning ?? response.reasoning;
      return {
        score,
        verdict: scoreToVerdict(score),
        hits,
        misses,
        expectedAspectCount: Math.max(hits.length + misses.length, 1),
        reasoning,
        evaluatorRawRequest,
        evaluatorResults
      };
    } catch {
      // NOTE(review): the error is swallowed here — any judge/parse failure
      // collapses to a bare score-0 "fail" with no reasoning or miss text,
      // unlike runCodeAggregator which records the message. Consider
      // surfacing the error for diagnosability.
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [],
        expectedAspectCount: 1,
        evaluatorRawRequest,
        evaluatorResults
      };
    }
  }
};
37755
38052
  var Node = class {
37756
38053
  value;
37757
38054
  next;
@@ -38426,7 +38723,6 @@ async function runEvaluatorList(options) {
38426
38723
  reasoning: score2.reasoning,
38427
38724
  evaluator_provider_request: score2.evaluatorRawRequest
38428
38725
  });
38429
- continue;
38430
38726
  }
38431
38727
  if (evaluator.type === "code") {
38432
38728
  const codeEvaluator = new CodeEvaluator({
@@ -38443,10 +38739,10 @@ async function runEvaluatorList(options) {
38443
38739
  promptInputs,
38444
38740
  now
38445
38741
  });
38446
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
38742
+ scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
38447
38743
  evaluatorResults.push({
38448
38744
  name: evaluator.name,
38449
- type: evaluator.type,
38745
+ type: "code_judge",
38450
38746
  score: score2.score,
38451
38747
  verdict: score2.verdict,
38452
38748
  hits: score2.hits,
@@ -38454,19 +38750,37 @@ async function runEvaluatorList(options) {
38454
38750
  reasoning: score2.reasoning,
38455
38751
  evaluator_provider_request: score2.evaluatorRawRequest
38456
38752
  });
38457
- continue;
38458
38753
  }
38459
- if (evaluator.type === "rubric") {
38460
- const rubricEvaluator = new RubricEvaluator({
38461
- config: evaluator,
38462
- resolveJudgeProvider: async (context) => {
38463
- if (context.judgeProvider) {
38464
- return context.judgeProvider;
38754
+ if (evaluator.type === "composite") {
38755
+ const evalFileDir = evalCase.guideline_paths[0] ? path122.dirname(evalCase.guideline_paths[0]) : process.cwd();
38756
+ const createEvaluator = (memberConfig) => {
38757
+ switch (memberConfig.type) {
38758
+ case "llm_judge":
38759
+ return evaluatorRegistry.llm_judge;
38760
+ case "code":
38761
+ return new CodeEvaluator({
38762
+ script: memberConfig.script,
38763
+ cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
38764
+ agentTimeoutMs
38765
+ });
38766
+ case "composite":
38767
+ return new CompositeEvaluator({
38768
+ config: memberConfig,
38769
+ cwd: evalFileDir,
38770
+ evaluatorFactory: { create: createEvaluator }
38771
+ });
38772
+ default: {
38773
+ const unknownConfig = memberConfig;
38774
+ throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
38465
38775
  }
38466
- return judgeProvider;
38467
38776
  }
38777
+ };
38778
+ const compositeEvaluator = new CompositeEvaluator({
38779
+ config: evaluator,
38780
+ cwd: evalFileDir,
38781
+ evaluatorFactory: { create: createEvaluator }
38468
38782
  });
38469
- const score2 = await rubricEvaluator.evaluate({
38783
+ const score2 = await compositeEvaluator.evaluate({
38470
38784
  evalCase,
38471
38785
  candidate,
38472
38786
  target,
@@ -38485,27 +38799,31 @@ async function runEvaluatorList(options) {
38485
38799
  hits: score2.hits,
38486
38800
  misses: score2.misses,
38487
38801
  reasoning: score2.reasoning,
38488
- evaluator_provider_request: score2.evaluatorRawRequest
38802
+ evaluator_provider_request: score2.evaluatorRawRequest,
38803
+ evaluator_results: mapChildResults(score2.evaluatorResults)
38489
38804
  });
38490
38805
  }
38491
38806
  } catch (error40) {
38492
38807
  const message = error40 instanceof Error ? error40.message : String(error40);
38493
38808
  const fallbackScore = {
38494
38809
  score: 0,
38810
+ verdict: "fail",
38495
38811
  hits: [],
38496
38812
  misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
38497
38813
  expectedAspectCount: 1,
38498
38814
  reasoning: message
38499
38815
  };
38816
+ const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
38500
38817
  scored.push({
38501
38818
  score: fallbackScore,
38502
38819
  name: evaluator.name ?? "unknown",
38503
- type: evaluator.type ?? "unknown"
38820
+ type: resultType ?? "llm_judge"
38504
38821
  });
38505
38822
  evaluatorResults.push({
38506
38823
  name: evaluator.name ?? "unknown",
38507
- type: evaluator.type ?? "unknown",
38824
+ type: resultType ?? "llm_judge",
38508
38825
  score: 0,
38826
+ verdict: "fail",
38509
38827
  hits: [],
38510
38828
  misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
38511
38829
  reasoning: message
@@ -38524,6 +38842,7 @@ async function runEvaluatorList(options) {
38524
38842
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
38525
38843
  const score = {
38526
38844
  score: aggregateScore,
38845
+ verdict: scoreToVerdict2(aggregateScore),
38527
38846
  hits,
38528
38847
  misses,
38529
38848
  expectedAspectCount,
@@ -38574,6 +38893,15 @@ async function resolveCustomPrompt(config2) {
38574
38893
  function isNonEmptyString2(value) {
38575
38894
  return typeof value === "string" && value.trim().length > 0;
38576
38895
  }
38896
/**
 * Map a numeric score onto a coarse verdict label.
 *
 * Thresholds: >= 0.8 is "pass", >= 0.6 is "borderline", anything lower
 * (including NaN, which fails both comparisons) is "fail".
 *
 * @param {number} score - Aggregate score, nominally in [0, 1].
 * @returns {"pass" | "borderline" | "fail"}
 */
function scoreToVerdict2(score) {
  return score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail";
}
38577
38905
  function filterEvalCases(evalCases, evalId) {
38578
38906
  if (!evalId) {
38579
38907
  return evalCases;
@@ -38711,6 +39039,23 @@ function isTimeoutLike(error40) {
38711
39039
  const value = String(error40).toLowerCase();
38712
39040
  return value.includes("timeout");
38713
39041
  }
39042
/**
 * Convert nested child-evaluator results from their internal camelCase shape
 * into the snake_case shape used in serialized reports
 * (evaluatorRawRequest -> evaluator_provider_request,
 * evaluatorResults -> evaluator_results), recursing into grandchildren.
 *
 * Returns undefined for a missing or empty list so empty arrays never appear
 * in the output.
 *
 * @param {Array<object>|undefined} children
 * @returns {Array<object>|undefined}
 */
function mapChildResults(children) {
  if (!children?.length) {
    return undefined;
  }
  const mapped = [];
  for (const child of children) {
    mapped.push({
      name: child.name,
      type: child.type,
      score: child.score,
      weight: child.weight,
      verdict: child.verdict,
      hits: child.hits,
      misses: child.misses,
      reasoning: child.reasoning,
      evaluator_provider_request: child.evaluatorRawRequest,
      evaluator_results: mapChildResults(child.evaluatorResults)
    });
  }
  return mapped;
}
38714
39059
  var rubricItemSchema = external_exports.object({
38715
39060
  id: external_exports.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
38716
39061
  description: external_exports.string().describe("What this rubric checks for"),
@@ -40964,8 +41309,8 @@ var evalCommand = command({
40964
41309
  workers: option({
40965
41310
  type: number4,
40966
41311
  long: "workers",
40967
- description: "Number of parallel workers (default: 1, max: 50). Can also be set per-target in targets.yaml",
40968
- defaultValue: () => 1
41312
+ description: "Number of parallel workers (default: 3, max: 50). Can also be set per-target in targets.yaml",
41313
+ defaultValue: () => 3
40969
41314
  }),
40970
41315
  out: option({
40971
41316
  type: optional2(string4),
@@ -41713,4 +42058,4 @@ export {
41713
42058
  app,
41714
42059
  runCli
41715
42060
  };
41716
- //# sourceMappingURL=chunk-WOCXZEH4.js.map
42061
+ //# sourceMappingURL=chunk-QRY42RAP.js.map