cclaw-cli 0.24.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,143 @@
1
+ /**
2
+ * Loader + validator for `.cclaw/evals/rubrics/<stage>.yaml`.
3
+ *
4
+ * Each file maps to exactly one `RubricDoc` that drives the LLM judge.
5
+ * Validation is strict: unknown top-level keys, missing required fields,
6
+ * duplicate check ids, and malformed weights all surface as actionable
7
+ * errors rather than turning into silent "judge had nothing to score"
8
+ * passes.
9
+ */
10
+ import fs from "node:fs/promises";
11
+ import path from "node:path";
12
+ import { parse } from "yaml";
13
+ import { EVALS_ROOT } from "../constants.js";
14
+ import { exists } from "../fs-utils.js";
15
+ import { FLOW_STAGES } from "../types.js";
16
+ export function rubricsDir(projectRoot) {
17
+ return path.join(projectRoot, EVALS_ROOT, "rubrics");
18
+ }
19
+ export function rubricPath(projectRoot, stage) {
20
+ return path.join(rubricsDir(projectRoot), `${stage}.yaml`);
21
+ }
22
+ function rubricError(file, reason) {
23
+ return new Error(`Invalid rubric at ${file}: ${reason}\n` +
24
+ `See docs/evals.md for the rubric schema. Fields: stage (required), id (optional, defaults to stage), checks[] with id + prompt.`);
25
+ }
26
+ function isRecord(value) {
27
+ return typeof value === "object" && value !== null && !Array.isArray(value);
28
+ }
29
+ function validateCheck(raw, index, file) {
30
+ if (!isRecord(raw)) {
31
+ throw rubricError(file, `checks[${index}] must be a mapping`);
32
+ }
33
+ const id = raw.id;
34
+ if (typeof id !== "string" || id.trim().length === 0) {
35
+ throw rubricError(file, `checks[${index}].id must be a non-empty string`);
36
+ }
37
+ if (!/^[a-z][a-z0-9-]*$/.test(id)) {
38
+ throw rubricError(file, `checks[${index}].id "${id}" must be kebab-case (lowercase letters, digits, hyphen; starts with a letter)`);
39
+ }
40
+ const prompt = raw.prompt;
41
+ if (typeof prompt !== "string" || prompt.trim().length === 0) {
42
+ throw rubricError(file, `checks[${index}].prompt must be a non-empty string`);
43
+ }
44
+ const check = {
45
+ id,
46
+ prompt: prompt.trim()
47
+ };
48
+ if (raw.scale !== undefined) {
49
+ if (typeof raw.scale !== "string" || raw.scale.trim().length === 0) {
50
+ throw rubricError(file, `checks[${index}].scale must be a non-empty string when provided`);
51
+ }
52
+ check.scale = raw.scale.trim();
53
+ }
54
+ if (raw.weight !== undefined) {
55
+ if (typeof raw.weight !== "number" || !Number.isFinite(raw.weight) || raw.weight < 0) {
56
+ throw rubricError(file, `checks[${index}].weight must be a non-negative number when provided`);
57
+ }
58
+ check.weight = raw.weight;
59
+ }
60
+ if (raw.critical !== undefined) {
61
+ if (typeof raw.critical !== "boolean") {
62
+ throw rubricError(file, `checks[${index}].critical must be a boolean when provided`);
63
+ }
64
+ check.critical = raw.critical;
65
+ }
66
+ const known = new Set(["id", "prompt", "scale", "weight", "critical"]);
67
+ const unknown = Object.keys(raw).filter((key) => !known.has(key));
68
+ if (unknown.length > 0) {
69
+ throw rubricError(file, `checks[${index}] has unknown key(s): ${unknown.join(", ")}`);
70
+ }
71
+ return check;
72
+ }
73
+ function validateRubric(raw, file) {
74
+ if (!isRecord(raw)) {
75
+ throw rubricError(file, "top-level value must be a mapping");
76
+ }
77
+ const stage = raw.stage;
78
+ if (typeof stage !== "string" || !FLOW_STAGES.includes(stage)) {
79
+ throw rubricError(file, `"stage" must be one of: ${FLOW_STAGES.join(", ")} (got: ${JSON.stringify(stage)})`);
80
+ }
81
+ const id = raw.id;
82
+ let rubricId = stage;
83
+ if (id !== undefined) {
84
+ if (typeof id !== "string" || id.trim().length === 0) {
85
+ throw rubricError(file, `"id" must be a non-empty string when provided`);
86
+ }
87
+ rubricId = id.trim();
88
+ }
89
+ const checks = raw.checks;
90
+ if (!Array.isArray(checks) || checks.length === 0) {
91
+ throw rubricError(file, `"checks" must be a non-empty array`);
92
+ }
93
+ const parsed = [];
94
+ const seen = new Set();
95
+ for (let i = 0; i < checks.length; i += 1) {
96
+ const check = validateCheck(checks[i], i, file);
97
+ if (seen.has(check.id)) {
98
+ throw rubricError(file, `duplicate check id: "${check.id}"`);
99
+ }
100
+ seen.add(check.id);
101
+ parsed.push(check);
102
+ }
103
+ const known = new Set(["stage", "id", "checks"]);
104
+ const unknown = Object.keys(raw).filter((key) => !known.has(key));
105
+ if (unknown.length > 0) {
106
+ throw rubricError(file, `unknown top-level key(s): ${unknown.join(", ")}`);
107
+ }
108
+ return {
109
+ stage: stage,
110
+ id: rubricId,
111
+ checks: parsed
112
+ };
113
+ }
114
+ /**
115
+ * Load the rubric for `stage`. Returns `undefined` when the file is
116
+ * missing so callers can emit a "no rubric" verifier result rather than
117
+ * crashing — authors are expected to grow rubrics incrementally.
118
+ */
119
+ export async function loadRubric(projectRoot, stage) {
120
+ const file = rubricPath(projectRoot, stage);
121
+ if (!(await exists(file)))
122
+ return undefined;
123
+ let parsed;
124
+ try {
125
+ parsed = parse(await fs.readFile(file, "utf8"));
126
+ }
127
+ catch (err) {
128
+ throw rubricError(file, err instanceof Error ? err.message : String(err));
129
+ }
130
+ return validateRubric(parsed, file);
131
+ }
132
+ /** Load every rubric file that exists under the project's rubrics directory, keyed by stage. */
133
+ export async function loadAllRubrics(projectRoot) {
134
+ const out = new Map();
135
+ for (const stage of FLOW_STAGES) {
136
+ const doc = await loadRubric(projectRoot, stage);
137
+ if (doc)
138
+ out.set(stage, doc);
139
+ }
140
+ return out;
141
+ }
142
+ /** Exposed for tests. */
143
+ export const __internal = { validateRubric, validateCheck };
@@ -1,4 +1,5 @@
1
1
  import type { FlowStage } from "../types.js";
2
+ import { type EvalLlmClient } from "./llm-client.js";
2
3
  import type { EvalReport, EvalTier, ResolvedEvalConfig } from "./types.js";
3
4
  export interface RunEvalOptions {
4
5
  projectRoot: string;
@@ -14,6 +15,12 @@ export interface RunEvalOptions {
14
15
  dryRun?: boolean;
15
16
  /** Override process.env during tests. */
16
17
  env?: NodeJS.ProcessEnv;
18
+ /**
19
+ * Optional LLM client injection. Primary use case: unit and
20
+ * integration tests that want deterministic judge + agent behavior
21
+ * without hitting the network.
22
+ */
23
+ llmClient?: EvalLlmClient;
17
24
  }
18
25
  export interface DryRunSummary {
19
26
  kind: "dry-run";
@@ -1,9 +1,14 @@
1
1
  import { randomUUID } from "node:crypto";
2
2
  import { CCLAW_VERSION } from "../constants.js";
3
3
  import { FLOW_STAGES } from "../types.js";
4
+ import { runSingleShot } from "./agents/single-shot.js";
4
5
  import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
5
6
  import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
6
7
  import { loadEvalConfig } from "./config-loader.js";
8
+ import { createCostGuard, DailyCostCapExceededError } from "./cost-guard.js";
9
+ import { createEvalClient, EvalLlmError } from "./llm-client.js";
10
+ import { loadAllRubrics } from "./rubric-loader.js";
11
+ import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
7
12
  import { verifyRules } from "./verifiers/rules.js";
8
13
  import { verifyStructural } from "./verifiers/structural.js";
9
14
  import { verifyTraceability } from "./verifiers/traceability.js";
@@ -26,16 +31,38 @@ function skeletonVerifierResult(message, details) {
26
31
  /**
27
32
  * --schema-only narrows to structural. --rules opens up rules + traceability
28
33
  * on top of structural (traceability is a rule-family verifier even though
29
- * it lives in its own module). Default (no flag) matches --schema-only for
30
- * backwards compatibility with the Step 1 gate.
34
+ * it lives in its own module). --judge opens up the LLM judge and, for
35
+ * Tier A, the single-shot agent-under-test. --schema-only always wins so
36
+ * the LLM-free PR gate never pays for tokens even if stale flags collide.
31
37
  */
32
38
  function resolveRunFlags(options) {
33
39
  const rulesRequested = options.rules === true;
34
40
  const schemaOnly = options.schemaOnly === true;
41
+ const judgeRequested = options.judge === true;
42
+ const runJudge = judgeRequested && !schemaOnly;
43
+ const runAgent = runJudge && (options.tier ?? "A") === "A";
35
44
  return {
36
45
  runStructural: true,
37
46
  runRules: rulesRequested && !schemaOnly,
38
- runTraceability: rulesRequested && !schemaOnly
47
+ runTraceability: rulesRequested && !schemaOnly,
48
+ runJudge,
49
+ runAgent
50
+ };
51
+ }
52
+ /**
53
+ * Wrap a client so every chat() result is accounted against the cost
54
+ * guard before being returned. The guard throws
55
+ * DailyCostCapExceededError if committing the call would cross the
56
+ * configured cap — the runner surfaces that as a hard failure so
57
+ * nightly CI fails loud instead of silently overspending.
58
+ */
59
+ function wrapClientWithCostGuard(client, costGuard, fallbackModel) {
60
+ return {
61
+ async chat(request) {
62
+ const response = await client.chat(request);
63
+ await costGuard.commit(response.model || fallbackModel, response.usage);
64
+ return response;
65
+ }
39
66
  };
40
67
  }
41
68
  async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
@@ -54,17 +81,61 @@ async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
54
81
  return undefined;
55
82
  }
56
83
  }
57
- async function runCase(projectRoot, caseEntry, plannedTier, flags) {
84
+ async function runCase(ctx) {
85
+ const { projectRoot, caseEntry, plannedTier, flags, config, client, costGuard, rubrics } = ctx;
58
86
  const started = Date.now();
59
87
  const verifierResults = [];
60
88
  const expected = caseEntry.expected;
89
+ let caseCostUsd = 0;
61
90
  const hasStructural = !!expected?.structural && Object.keys(expected.structural).length > 0;
62
91
  const hasRules = flags.runRules && !!expected?.rules && Object.keys(expected.rules).length > 0;
63
92
  const hasTraceability = flags.runTraceability && !!expected?.traceability;
64
- const needsArtifact = hasStructural || hasRules || hasTraceability;
93
+ const judgeRequested = flags.runJudge && !!expected?.judge;
94
+ const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
65
95
  let artifact;
66
96
  if (needsArtifact) {
67
- artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
97
+ if (flags.runAgent && judgeRequested && client) {
98
+ try {
99
+ const produced = await runSingleShot({
100
+ caseEntry,
101
+ config,
102
+ projectRoot,
103
+ client
104
+ });
105
+ artifact = produced.artifact;
106
+ caseCostUsd += produced.usageUsd;
107
+ verifierResults.push({
108
+ kind: "workflow",
109
+ id: "agent:single-shot",
110
+ ok: true,
111
+ score: 1,
112
+ message: `single-shot agent produced ${produced.artifact.length} char(s) in ${produced.durationMs}ms`,
113
+ details: {
114
+ model: produced.model,
115
+ tokensIn: produced.usage.promptTokens,
116
+ tokensOut: produced.usage.completionTokens,
117
+ usageUsd: produced.usageUsd,
118
+ attempts: produced.attempts
119
+ }
120
+ });
121
+ }
122
+ catch (err) {
123
+ if (err instanceof DailyCostCapExceededError)
124
+ throw err;
125
+ const retryable = err instanceof EvalLlmError ? err.retryable : false;
126
+ verifierResults.push({
127
+ kind: "workflow",
128
+ id: "agent:single-shot",
129
+ ok: false,
130
+ score: 0,
131
+ message: err instanceof Error ? err.message : String(err),
132
+ details: { retryable }
133
+ });
134
+ }
135
+ }
136
+ else {
137
+ artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
138
+ }
68
139
  if (artifact === undefined && verifierResults.length === 0) {
69
140
  verifierResults.push({
70
141
  kind: "structural",
@@ -111,6 +182,46 @@ async function runCase(projectRoot, caseEntry, plannedTier, flags) {
111
182
  });
112
183
  }
113
184
  }
185
+ if (judgeRequested && artifact !== undefined && client) {
186
+ const rubric = rubrics.get(caseEntry.stage);
187
+ if (!rubric) {
188
+ verifierResults.push({
189
+ kind: "judge",
190
+ id: "judge:rubric:missing",
191
+ ok: false,
192
+ score: 0,
193
+ message: `No rubric at .cclaw/evals/rubrics/${caseEntry.stage}.yaml. Add one before running --judge.`,
194
+ details: { stage: caseEntry.stage }
195
+ });
196
+ }
197
+ else {
198
+ try {
199
+ const invocation = await runJudge({
200
+ artifact,
201
+ rubric,
202
+ config,
203
+ client,
204
+ caseHint: expected.judge
205
+ });
206
+ caseCostUsd += invocation.usageUsd;
207
+ const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, expected.judge);
208
+ verifierResults.push(...judgeVerifiers);
209
+ }
210
+ catch (err) {
211
+ if (err instanceof DailyCostCapExceededError)
212
+ throw err;
213
+ const retryable = err instanceof EvalLlmError ? err.retryable : false;
214
+ verifierResults.push({
215
+ kind: "judge",
216
+ id: "judge:invocation:error",
217
+ ok: false,
218
+ score: 0,
219
+ message: err instanceof Error ? err.message : String(err),
220
+ details: { retryable, rubricId: rubric.id }
221
+ });
222
+ }
223
+ }
224
+ }
114
225
  const nonSkippedResults = verifierResults.filter((r) => r.details?.skipped !== true);
115
226
  const allOk = nonSkippedResults.length === 0
116
227
  ? verifierResults.every((r) => r.ok)
@@ -121,6 +232,7 @@ async function runCase(projectRoot, caseEntry, plannedTier, flags) {
121
232
  tier: plannedTier,
122
233
  passed: allOk,
123
234
  durationMs: Date.now() - started,
235
+ costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
124
236
  verifierResults
125
237
  };
126
238
  }
@@ -173,10 +285,13 @@ export async function runEval(options) {
173
285
  if (corpus.length === 0) {
174
286
  notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
175
287
  }
176
- if (options.judge) {
177
- notes.push("--judge is accepted; LLM judging is not wired yet.");
178
- }
179
288
  const flags = resolveRunFlags(options);
289
+ if (flags.runJudge && !config.apiKey && !options.llmClient) {
290
+ notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
291
+ }
292
+ if ((options.tier ?? "A") !== "A" && flags.runJudge) {
293
+ notes.push("Tier B/C agent-under-test is not wired yet; --judge will score the committed fixture as a stand-in.");
294
+ }
180
295
  if (options.dryRun === true) {
181
296
  const summary = {
182
297
  kind: "dry-run",
@@ -190,17 +305,35 @@ export async function runEval(options) {
190
305
  verifiersAvailable: {
191
306
  structural: flags.runStructural,
192
307
  rules: flags.runRules,
193
- judge: false,
194
- workflow: false
308
+ judge: flags.runJudge,
309
+ workflow: flags.runAgent
195
310
  },
196
311
  notes
197
312
  };
198
313
  return summary;
199
314
  }
315
+ const costGuard = createCostGuard(options.projectRoot, config);
316
+ let wrappedClient;
317
+ if (flags.runJudge) {
318
+ const base = options.llmClient ?? createEvalClient(config);
319
+ wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
320
+ }
321
+ const rubrics = flags.runJudge
322
+ ? await loadAllRubrics(options.projectRoot)
323
+ : new Map();
200
324
  const now = new Date().toISOString();
201
325
  const caseResults = [];
202
326
  for (const item of corpus) {
203
- caseResults.push(await runCase(options.projectRoot, item, plannedTier, flags));
327
+ caseResults.push(await runCase({
328
+ projectRoot: options.projectRoot,
329
+ caseEntry: item,
330
+ plannedTier,
331
+ flags,
332
+ config,
333
+ client: wrappedClient,
334
+ costGuard,
335
+ rubrics
336
+ }));
204
337
  }
205
338
  const stages = stagesInResults(caseResults);
206
339
  const baselines = await loadBaselinesByStage(options.projectRoot, stages);
@@ -114,6 +114,31 @@ export interface TraceabilityExpected {
114
114
  */
115
115
  requireIn: string[];
116
116
  }
117
+ /**
118
+ * LLM-judge expectations — Step 3.
119
+ *
120
+ * When present, the judge runs against the resolved artifact (live-agent
121
+ * output in Tier A; the committed fixture for Tier B/C, whose agent is not
122
+ * wired yet). `--schema-only` disables the judge. Every field below is
123
+ * optional; the case-level hint overlays the stage-level rubric loaded
124
+ * from `.cclaw/evals/rubrics/<stage>.yaml`.
125
+ */
126
+ export interface JudgeExpected {
127
+ /**
128
+ * Per-case check ids that MUST be present in the stage rubric. Used when
129
+ * a case wants to assert the rubric covers scenario-specific properties.
130
+ */
131
+ requiredChecks?: string[];
132
+ /**
133
+ * Stage rubric identifier when a stage ships multiple rubrics (e.g.
134
+ * "strict" vs. "lenient"). Defaults to the stage name.
135
+ */
136
+ rubric?: string;
137
+ /** Optional override of `config.judgeSamples` for the case. */
138
+ samples?: number;
139
+ /** Per-check minimum score (1..5 scale). Fail when any score drops below. */
140
+ minimumScores?: Record<string, number>;
141
+ }
117
142
  /** Superset of per-verifier expectation shapes. */
118
143
  export interface ExpectedShape {
119
144
  structural?: StructuralExpected;
@@ -122,7 +147,7 @@ export interface ExpectedShape {
122
147
  /** Cross-stage ID propagation checks — Step 2. */
123
148
  traceability?: TraceabilityExpected;
124
149
  /** LLM-judge rubrics — Step 3. */
125
- judge?: Record<string, unknown>;
150
+ judge?: JudgeExpected;
126
151
  }
127
152
  /**
128
153
  * A single eval case describes one input scenario for one stage. Cases live in
@@ -228,6 +253,26 @@ export interface EvalConfig {
228
253
  timeoutMs: number;
229
254
  /** Max retries per API call on transient failures. */
230
255
  maxRetries: number;
256
+ /**
257
+ * Number of judge samples per case (median-of-N). Defaults to 3 when unset.
258
+ * Must be odd so a true median exists.
259
+ */
260
+ judgeSamples?: number;
261
+ /** Sampling temperature for judge calls. Defaults to 0.0. */
262
+ judgeTemperature?: number;
263
+ /** Sampling temperature for the agent-under-test. Defaults to 0.2. */
264
+ agentTemperature?: number;
265
+ /**
266
+ * Optional per-model USD pricing used by the cost guard. Keys match
267
+ * `model` / `judgeModel`. Values in USD per 1K tokens, so
268
+ * `{ input: 0.0005, output: 0.0015 }` = $0.50 per 1M input tokens.
269
+ */
270
+ tokenPricing?: Record<string, TokenPricing>;
271
+ }
272
+ /** Per-model pricing schedule, expressed as USD per 1K tokens. */
273
+ export interface TokenPricing {
274
+ input: number;
275
+ output: number;
231
276
  }
232
277
  /** Resolved config with env overrides applied. */
233
278
  export interface ResolvedEvalConfig extends EvalConfig {
@@ -279,3 +324,60 @@ export interface BaselineRegression {
279
324
  previousScore?: number;
280
325
  currentScore?: number;
281
326
  }
327
+ /**
328
+ * One rubric check evaluated by the LLM judge. Scored on a 1..5 scale;
329
+ * 5 means "the artifact fully meets the bar described by `prompt`".
330
+ */
331
+ export interface RubricCheck {
332
+ /** Kebab-case slug, unique per rubric. Stable across runs. */
333
+ id: string;
334
+ /** Natural-language question posed to the judge. */
335
+ prompt: string;
336
+ /** Human-readable scale description rendered in judge prompts. */
337
+ scale?: string;
338
+ /** Relative weight for the stage's aggregate score. Defaults to 1.0. */
339
+ weight?: number;
340
+ /**
341
+ * When true, any sample below `config.regression.failIfCriticalBelow`
342
+ * flips the verifier to `ok:false` (not just a score drop).
343
+ */
344
+ critical?: boolean;
345
+ }
346
+ /** Parsed `.cclaw/evals/rubrics/<stage>.yaml`. */
347
+ export interface RubricDoc {
348
+ stage: FlowStage;
349
+ /** Rubric variant label; the loader falls back to the stage name when the YAML omits `id`. */
350
+ id: string;
351
+ checks: RubricCheck[];
352
+ }
353
+ /**
354
+ * Judge response for a single sample (one API call). The judge is asked to
355
+ * return structured JSON; `scores[id]` maps rubric check id → integer 1..5.
356
+ * `rationales[id]` is a short plain-text explanation, useful in reports but
357
+ * never used for gating.
358
+ */
359
+ export interface JudgeSample {
360
+ scores: Record<string, number>;
361
+ rationales: Record<string, string>;
362
+ }
363
+ /** Aggregated judge output across N samples, per rubric check. */
364
+ export interface JudgeAggregate {
365
+ checkId: string;
366
+ samples: number[];
367
+ median: number;
368
+ mean: number;
369
+ /** True iff every sample returned a score for this check. */
370
+ coverage: boolean;
371
+ }
372
+ /**
373
+ * Judge invocation result. Produced by `runJudge` and consumed by the
374
+ * runner: the runner converts each aggregate into a `VerifierResult` and
375
+ * records `usageUsd` toward the per-case cost.
376
+ */
377
+ export interface JudgeInvocation {
378
+ rubricId: string;
379
+ samples: JudgeSample[];
380
+ aggregates: JudgeAggregate[];
381
+ usageUsd: number;
382
+ durationMs: number;
383
+ }
@@ -0,0 +1,40 @@
1
+ /**
2
+ * LLM judge verifier — Step 3.
3
+ *
4
+ * Given an artifact and the stage's rubric, runs N judge samples (default
5
+ * median-of-3) against the configured LLM, aggregates the per-check
6
+ * scores, and returns one VerifierResult per rubric check plus one
7
+ * aggregate result covering the whole stage.
8
+ *
9
+ * Deterministic pieces (JSON parsing, aggregation, scoring) are kept pure
10
+ * so unit tests inject a stub EvalLlmClient and assert on the aggregate
11
+ * math without touching the network.
12
+ */
13
+ import { type EvalLlmClient } from "../llm-client.js";
14
+ import type { JudgeExpected, JudgeInvocation, JudgeSample, ResolvedEvalConfig, RubricDoc, VerifierResult } from "../types.js";
15
+ export interface RunJudgeOptions {
16
+ artifact: string;
17
+ rubric: RubricDoc;
18
+ config: Pick<ResolvedEvalConfig, "model" | "judgeModel" | "judgeSamples" | "judgeTemperature" | "timeoutMs" | "tokenPricing">;
19
+ client: EvalLlmClient;
20
+ /** Per-case hint that overlays the rubric (sample count, minimums). */
21
+ caseHint?: JudgeExpected;
22
+ /** Optional base seed; incremented per sample for reproducibility. */
23
+ baseSeed?: number;
24
+ }
25
+ /**
26
+ * Parse one judge response into a JudgeSample. The parser is intentionally
27
+ * forgiving with rationales (missing -> empty string) but strict with
28
+ * scores: missing or non-numeric entries are dropped and the coverage
29
+ * flag on the aggregate flips to false.
30
+ */
31
+ export declare function parseJudgeResponse(content: string, rubric: RubricDoc): JudgeSample;
32
+ /** Run the judge against an artifact and return per-sample + aggregate data. */
33
+ export declare function runJudge(options: RunJudgeOptions): Promise<JudgeInvocation>;
34
+ /**
35
+ * Convert a JudgeInvocation into VerifierResult[] for the runner. One
36
+ * result per rubric check (score 0..1 normalized from the 1..5 median) +
37
+ * one "coverage" result that flips to `ok:false` when any sample failed
38
+ * to emit a score for a check.
39
+ */
40
+ export declare function judgeResultsToVerifiers(rubric: RubricDoc, invocation: JudgeInvocation, config: Pick<ResolvedEvalConfig, "regression">, caseHint?: JudgeExpected): VerifierResult[];