cclaw-cli 0.23.1 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,17 @@
1
1
  import { randomUUID } from "node:crypto";
2
2
  import { CCLAW_VERSION } from "../constants.js";
3
3
  import { FLOW_STAGES } from "../types.js";
4
+ import { runSingleShot } from "./agents/single-shot.js";
4
5
  import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
5
- import { loadCorpus, readFixtureArtifact } from "./corpus.js";
6
+ import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
6
7
  import { loadEvalConfig } from "./config-loader.js";
8
+ import { createCostGuard, DailyCostCapExceededError } from "./cost-guard.js";
9
+ import { createEvalClient, EvalLlmError } from "./llm-client.js";
10
+ import { loadAllRubrics } from "./rubric-loader.js";
11
+ import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
12
+ import { verifyRules } from "./verifiers/rules.js";
7
13
  import { verifyStructural } from "./verifiers/structural.js";
14
+ import { verifyTraceability } from "./verifiers/traceability.js";
8
15
  function groupByStage(cases) {
9
16
  return cases.reduce((acc, item) => {
10
17
  acc[item.stage] = (acc[item.stage] ?? 0) + 1;
@@ -21,33 +28,131 @@ function skeletonVerifierResult(message, details) {
21
28
  ...(details !== undefined ? { details } : {})
22
29
  };
23
30
  }
24
- async function runCaseStructural(projectRoot, caseEntry, plannedTier) {
31
+ /**
32
+ * --schema-only narrows to structural. --rules opens up rules + traceability
33
+ * on top of structural (traceability is a rule-family verifier even though
34
+ * it lives in its own module). --judge opens up the LLM judge and, for
35
+ * Tier A, the single-shot agent-under-test. --schema-only always wins so
36
+ * the LLM-free PR gate never pays for tokens even if stale flags collide.
37
+ */
38
+ function resolveRunFlags(options) {
39
+ const rulesRequested = options.rules === true;
40
+ const schemaOnly = options.schemaOnly === true;
41
+ const judgeRequested = options.judge === true;
42
+ const runJudge = judgeRequested && !schemaOnly;
43
+ const runAgent = runJudge && (options.tier ?? "A") === "A";
44
+ return {
45
+ runStructural: true,
46
+ runRules: rulesRequested && !schemaOnly,
47
+ runTraceability: rulesRequested && !schemaOnly,
48
+ runJudge,
49
+ runAgent
50
+ };
51
+ }
52
+ /**
53
+ * Wrap a client so every chat() result is accounted against the cost
54
+ * guard before being returned. The guard throws
55
+ * DailyCostCapExceededError if committing the call would cross the
56
+ * configured cap — the runner surfaces that as a hard failure so
57
+ * nightly CI fails loud instead of silently overspending.
58
+ */
59
+ function wrapClientWithCostGuard(client, costGuard, fallbackModel) {
60
+ return {
61
+ async chat(request) {
62
+ const response = await client.chat(request);
63
+ await costGuard.commit(response.model || fallbackModel, response.usage);
64
+ return response;
65
+ }
66
+ };
67
+ }
68
+ async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
69
+ try {
70
+ return await readFixtureArtifact(projectRoot, caseEntry);
71
+ }
72
+ catch (err) {
73
+ verifierResults.push({
74
+ kind: "structural",
75
+ id: "structural:fixture:missing",
76
+ ok: false,
77
+ score: 0,
78
+ message: err instanceof Error ? err.message : String(err),
79
+ details: { fixture: caseEntry.fixture }
80
+ });
81
+ return undefined;
82
+ }
83
+ }
84
+ async function runCase(ctx) {
85
+ const { projectRoot, caseEntry, plannedTier, flags, config, client, costGuard, rubrics } = ctx;
25
86
  const started = Date.now();
26
- const structuralExpected = caseEntry.expected?.structural;
27
87
  const verifierResults = [];
28
- if (!structuralExpected || Object.keys(structuralExpected).length === 0) {
29
- // No structural expectations declared — case is treated as "N/A" for this
30
- // verifier kind; a placeholder pass keeps downstream math simple while
31
- // making the situation visible in the report.
32
- verifierResults.push(skeletonVerifierResult("No structural expectations declared for this case; structural verifier skipped.", { skipped: true }));
33
- }
34
- else {
35
- let artifact;
36
- try {
37
- artifact = await readFixtureArtifact(projectRoot, caseEntry);
88
+ const expected = caseEntry.expected;
89
+ let caseCostUsd = 0;
90
+ const hasStructural = !!expected?.structural && Object.keys(expected.structural).length > 0;
91
+ const hasRules = flags.runRules && !!expected?.rules && Object.keys(expected.rules).length > 0;
92
+ const hasTraceability = flags.runTraceability && !!expected?.traceability;
93
+ const judgeRequested = flags.runJudge && !!expected?.judge;
94
+ const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
95
+ let artifact;
96
+ if (needsArtifact) {
97
+ if (flags.runAgent && judgeRequested && client) {
98
+ try {
99
+ const produced = await runSingleShot({
100
+ caseEntry,
101
+ config,
102
+ projectRoot,
103
+ client
104
+ });
105
+ artifact = produced.artifact;
106
+ caseCostUsd += produced.usageUsd;
107
+ verifierResults.push({
108
+ kind: "workflow",
109
+ id: "agent:single-shot",
110
+ ok: true,
111
+ score: 1,
112
+ message: `single-shot agent produced ${produced.artifact.length} char(s) in ${produced.durationMs}ms`,
113
+ details: {
114
+ model: produced.model,
115
+ tokensIn: produced.usage.promptTokens,
116
+ tokensOut: produced.usage.completionTokens,
117
+ usageUsd: produced.usageUsd,
118
+ attempts: produced.attempts
119
+ }
120
+ });
121
+ }
122
+ catch (err) {
123
+ if (err instanceof DailyCostCapExceededError)
124
+ throw err;
125
+ const retryable = err instanceof EvalLlmError ? err.retryable : false;
126
+ verifierResults.push({
127
+ kind: "workflow",
128
+ id: "agent:single-shot",
129
+ ok: false,
130
+ score: 0,
131
+ message: err instanceof Error ? err.message : String(err),
132
+ details: { retryable }
133
+ });
134
+ }
38
135
  }
39
- catch (err) {
136
+ else {
137
+ artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
138
+ }
139
+ if (artifact === undefined && verifierResults.length === 0) {
40
140
  verifierResults.push({
41
141
  kind: "structural",
42
- id: "structural:fixture:missing",
142
+ id: "structural:fixture:absent",
43
143
  ok: false,
44
144
  score: 0,
45
- message: err instanceof Error ? err.message : String(err),
46
- details: { fixture: caseEntry.fixture }
145
+ message: "Expectations declared but no fixture path provided. Add `fixture: ./<id>/fixture.md`.",
146
+ details: { fixtureProvided: false }
47
147
  });
48
148
  }
49
- if (artifact !== undefined) {
50
- const results = verifyStructural(artifact, structuralExpected);
149
+ }
150
+ if (flags.runStructural) {
151
+ if (!hasStructural) {
152
+ verifierResults.push(skeletonVerifierResult("No structural expectations declared for this case; structural verifier skipped.", { skipped: true }));
153
+ }
154
+ else if (artifact !== undefined) {
155
+ const results = verifyStructural(artifact, expected.structural);
51
156
  if (results.length === 0) {
52
157
  verifierResults.push(skeletonVerifierResult("Structural expectations parsed but produced zero checks.", { skipped: true }));
53
158
  }
@@ -55,24 +160,79 @@ async function runCaseStructural(projectRoot, caseEntry, plannedTier) {
55
160
  verifierResults.push(...results);
56
161
  }
57
162
  }
58
- else if (verifierResults.length === 0) {
163
+ }
164
+ if (hasRules && artifact !== undefined) {
165
+ const results = verifyRules(artifact, expected.rules);
166
+ verifierResults.push(...results);
167
+ }
168
+ if (hasTraceability && artifact !== undefined) {
169
+ try {
170
+ const extras = await readExtraFixtures(projectRoot, caseEntry);
171
+ const results = verifyTraceability(artifact, extras, expected.traceability);
172
+ verifierResults.push(...results);
173
+ }
174
+ catch (err) {
59
175
  verifierResults.push({
60
- kind: "structural",
61
- id: "structural:fixture:absent",
176
+ kind: "rules",
177
+ id: "traceability:fixture:missing",
62
178
  ok: false,
63
179
  score: 0,
64
- message: "Structural expectations declared but no fixture path provided. Add `fixture: ./<id>/fixture.md`.",
65
- details: { fixtureProvided: false }
180
+ message: err instanceof Error ? err.message : String(err),
181
+ details: { extraFixtures: Object.keys(caseEntry.extraFixtures ?? {}) }
66
182
  });
67
183
  }
68
184
  }
69
- const allOk = verifierResults.every((r) => r.ok);
185
+ if (judgeRequested && artifact !== undefined && client) {
186
+ const rubric = rubrics.get(caseEntry.stage);
187
+ if (!rubric) {
188
+ verifierResults.push({
189
+ kind: "judge",
190
+ id: "judge:rubric:missing",
191
+ ok: false,
192
+ score: 0,
193
+ message: `No rubric at .cclaw/evals/rubrics/${caseEntry.stage}.yaml. Add one before running --judge.`,
194
+ details: { stage: caseEntry.stage }
195
+ });
196
+ }
197
+ else {
198
+ try {
199
+ const invocation = await runJudge({
200
+ artifact,
201
+ rubric,
202
+ config,
203
+ client,
204
+ caseHint: expected.judge
205
+ });
206
+ caseCostUsd += invocation.usageUsd;
207
+ const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, expected.judge);
208
+ verifierResults.push(...judgeVerifiers);
209
+ }
210
+ catch (err) {
211
+ if (err instanceof DailyCostCapExceededError)
212
+ throw err;
213
+ const retryable = err instanceof EvalLlmError ? err.retryable : false;
214
+ verifierResults.push({
215
+ kind: "judge",
216
+ id: "judge:invocation:error",
217
+ ok: false,
218
+ score: 0,
219
+ message: err instanceof Error ? err.message : String(err),
220
+ details: { retryable, rubricId: rubric.id }
221
+ });
222
+ }
223
+ }
224
+ }
225
+ const nonSkippedResults = verifierResults.filter((r) => r.details?.skipped !== true);
226
+ const allOk = nonSkippedResults.length === 0
227
+ ? verifierResults.every((r) => r.ok)
228
+ : nonSkippedResults.every((r) => r.ok);
70
229
  return {
71
230
  caseId: caseEntry.id,
72
231
  stage: caseEntry.stage,
73
232
  tier: plannedTier,
74
233
  passed: allOk,
75
234
  durationMs: Date.now() - started,
235
+ costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
76
236
  verifierResults
77
237
  };
78
238
  }
@@ -125,11 +285,12 @@ export async function runEval(options) {
125
285
  if (corpus.length === 0) {
126
286
  notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
127
287
  }
128
- if (options.rules) {
129
- notes.push("--rules is accepted; rule verifiers are not wired yet.");
288
+ const flags = resolveRunFlags(options);
289
+ if (flags.runJudge && !config.apiKey && !options.llmClient) {
290
+ notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
130
291
  }
131
- if (options.judge) {
132
- notes.push("--judge is accepted; LLM judging is not wired yet.");
292
+ if ((options.tier ?? "A") !== "A" && flags.runJudge) {
293
+ notes.push("Tier B/C agent-under-test is not wired yet; --judge will score the committed fixture as a stand-in.");
133
294
  }
134
295
  if (options.dryRun === true) {
135
296
  const summary = {
@@ -142,19 +303,37 @@ export async function runEval(options) {
142
303
  },
143
304
  plannedTier,
144
305
  verifiersAvailable: {
145
- structural: true,
146
- rules: false,
147
- judge: false,
148
- workflow: false
306
+ structural: flags.runStructural,
307
+ rules: flags.runRules,
308
+ judge: flags.runJudge,
309
+ workflow: flags.runAgent
149
310
  },
150
311
  notes
151
312
  };
152
313
  return summary;
153
314
  }
315
+ const costGuard = createCostGuard(options.projectRoot, config);
316
+ let wrappedClient;
317
+ if (flags.runJudge) {
318
+ const base = options.llmClient ?? createEvalClient(config);
319
+ wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
320
+ }
321
+ const rubrics = flags.runJudge
322
+ ? await loadAllRubrics(options.projectRoot)
323
+ : new Map();
154
324
  const now = new Date().toISOString();
155
325
  const caseResults = [];
156
326
  for (const item of corpus) {
157
- caseResults.push(await runCaseStructural(options.projectRoot, item, plannedTier));
327
+ caseResults.push(await runCase({
328
+ projectRoot: options.projectRoot,
329
+ caseEntry: item,
330
+ plannedTier,
331
+ flags,
332
+ config,
333
+ client: wrappedClient,
334
+ costGuard,
335
+ rubrics
336
+ }));
158
337
  }
159
338
  const stages = stagesInResults(caseResults);
160
339
  const baselines = await loadBaselinesByStage(options.projectRoot, stages);
@@ -58,13 +58,96 @@ export interface StructuralExpected {
58
58
  */
59
59
  requiredFrontmatterKeys?: string[];
60
60
  }
61
- /** Superset of per-verifier expectation shapes. Only `structural` is wired in Step 1. */
61
+ /**
62
+ * Rule-based expectations — zero-LLM content checks that are richer than
63
+ * structural (regex, numeric bounds, uniqueness). Introduced in Step 2.
64
+ *
65
+ * Every array field is optional; an empty `RulesExpected` produces zero
66
+ * verifier results so authors can enable rules incrementally.
67
+ */
68
+ export interface RulesExpected {
69
+ /** Case-insensitive substrings the body must include at least once. */
70
+ mustContain?: string[];
71
+ /** Case-insensitive substrings the body must NOT include. */
72
+ mustNotContain?: string[];
73
+ /** Regex patterns that must match the body at least once. */
74
+ regexRequired?: RuleRegex[];
75
+ /** Regex patterns that must NOT match the body. */
76
+ regexForbidden?: RuleRegex[];
77
+ /** For each substring key, the body must contain at least N occurrences. */
78
+ minOccurrences?: Record<string, number>;
79
+ /** For each substring key, the body must contain at most N occurrences. */
80
+ maxOccurrences?: Record<string, number>;
81
+ /**
82
+ * For each named section (case-insensitive heading substring), every bullet
83
+ * (`- ...`) directly under the section must be unique. Catches duplicated
84
+ * decisions or repeated risks.
85
+ */
86
+ uniqueBulletsInSection?: string[];
87
+ }
88
+ export interface RuleRegex {
89
+ /** Source of the regex. Parsed with `new RegExp(pattern, flags)`. */
90
+ pattern: string;
91
+ /** Optional regex flags; defaults to `"i"` for case-insensitive matching. */
92
+ flags?: string;
93
+ /** Human-readable label rendered in verifier messages and slugged into the id. */
94
+ description?: string;
95
+ }
96
+ /**
97
+ * Cross-stage traceability expectations — assert every ID extracted from
98
+ * `source` also appears in `self` and/or named `extra_fixtures`. Introduced
99
+ * in Step 2.
100
+ */
101
+ export interface TraceabilityExpected {
102
+ /** Regex applied to the `source` fixture to collect the authoritative ID set. */
103
+ idPattern: string;
104
+ /** Optional regex flags (defaults to `"g"`). */
105
+ idFlags?: string;
106
+ /**
107
+ * Where to read the authoritative ID set from. Either `"self"` (the case's
108
+ * primary `fixture`) or a label present in the case's `extraFixtures` map.
109
+ */
110
+ source: string;
111
+ /**
112
+ * Where every source ID must also appear. Each entry is `"self"` or an
113
+ * `extraFixtures` label. Order is preserved for deterministic result ids.
114
+ */
115
+ requireIn: string[];
116
+ }
117
+ /**
118
+ * LLM-judge expectations — Step 3.
119
+ *
120
+ * When present, the judge runs against the resolved artifact (single-shot
121
+ * agent output in Tier A, or the committed fixture as a stand-in for Tier
122
+ * B/C; `--schema-only` disables the judge entirely). Every field below is
123
+ * optional; the case-level hint overlays the stage-level rubric loaded
124
+ * from `.cclaw/evals/rubrics/<stage>.yaml`.
125
+ */
126
+ export interface JudgeExpected {
127
+ /**
128
+ * Per-case check ids that MUST be present in the stage rubric. Used when
129
+ * a case wants to assert the rubric covers scenario-specific properties.
130
+ */
131
+ requiredChecks?: string[];
132
+ /**
133
+ * Stage rubric identifier when a stage ships multiple rubrics (e.g.
134
+ * "strict" vs. "lenient"). Defaults to the stage name.
135
+ */
136
+ rubric?: string;
137
+ /** Optional override of `config.judgeSamples` for the case. */
138
+ samples?: number;
139
+ /** Per-check minimum score (1..5 scale). Fail when any score drops below. */
140
+ minimumScores?: Record<string, number>;
141
+ }
142
+ /** Superset of per-verifier expectation shapes. */
62
143
  export interface ExpectedShape {
63
144
  structural?: StructuralExpected;
64
- /** Rule-based (keyword/regex/traceability) checks — Step 2. */
65
- rules?: Record<string, unknown>;
145
+ /** Rule-based (keyword/regex/count/uniqueness) checks — Step 2. */
146
+ rules?: RulesExpected;
147
+ /** Cross-stage ID propagation checks — Step 2. */
148
+ traceability?: TraceabilityExpected;
66
149
  /** LLM-judge rubrics — Step 3. */
67
- judge?: Record<string, unknown>;
150
+ judge?: JudgeExpected;
68
151
  }
69
152
  /**
70
153
  * A single eval case describes one input scenario for one stage. Cases live in
@@ -89,6 +172,13 @@ export interface EvalCase {
89
172
  * Step 1 development aid.
90
173
  */
91
174
  fixture?: string;
175
+ /**
176
+ * Additional fixture paths loaded alongside the primary `fixture`, keyed
177
+ * by a free-form label. Consumed by cross-artifact verifiers (e.g.,
178
+ * traceability) introduced in Step 2. Paths are resolved relative to the
179
+ * case's stage directory, just like `fixture`.
180
+ */
181
+ extraFixtures?: Record<string, string>;
92
182
  }
93
183
  /** Result of one verifier applied to one case. */
94
184
  export interface VerifierResult {
@@ -163,6 +253,26 @@ export interface EvalConfig {
163
253
  timeoutMs: number;
164
254
  /** Max retries per API call on transient failures. */
165
255
  maxRetries: number;
256
+ /**
257
+ * Number of judge samples per case (median-of-N). Defaults to 3 when unset.
258
+ * Must be odd so a true median exists.
259
+ */
260
+ judgeSamples?: number;
261
+ /** Sampling temperature for judge calls. Defaults to 0.0. */
262
+ judgeTemperature?: number;
263
+ /** Sampling temperature for the agent-under-test. Defaults to 0.2. */
264
+ agentTemperature?: number;
265
+ /**
266
+ * Optional per-model USD pricing used by the cost guard. Keys match
267
+ * `model` / `judgeModel`. Values in USD per 1K tokens, so
268
+ * `{ input: 0.0005, output: 0.0015 }` = $0.50 per 1M input tokens.
269
+ */
270
+ tokenPricing?: Record<string, TokenPricing>;
271
+ }
272
+ /** Per-model pricing schedule, expressed as USD per 1K tokens. */
273
+ export interface TokenPricing {
274
+ input: number;
275
+ output: number;
166
276
  }
167
277
  /** Resolved config with env overrides applied. */
168
278
  export interface ResolvedEvalConfig extends EvalConfig {
@@ -214,3 +324,60 @@ export interface BaselineRegression {
214
324
  previousScore?: number;
215
325
  currentScore?: number;
216
326
  }
327
+ /**
328
+ * One rubric check evaluated by the LLM judge. Scored on a 1..5 scale;
329
+ * 5 means "the artifact fully meets the bar described by `prompt`".
330
+ */
331
+ export interface RubricCheck {
332
+ /** Kebab-case slug, unique per rubric. Stable across runs. */
333
+ id: string;
334
+ /** Natural-language question posed to the judge. */
335
+ prompt: string;
336
+ /** Human-readable scale description rendered in judge prompts. */
337
+ scale?: string;
338
+ /** Relative weight for the stage's aggregate score. Defaults to 1.0. */
339
+ weight?: number;
340
+ /**
341
+ * When true, any sample below `config.regression.failIfCriticalBelow`
342
+ * flips the verifier to `ok:false` (not just a score drop).
343
+ */
344
+ critical?: boolean;
345
+ }
346
+ /** Parsed `.cclaw/evals/rubrics/<stage>.yaml`. */
347
+ export interface RubricDoc {
348
+ stage: FlowStage;
349
+ /** Rubric variant label. Always set on the parsed doc; defaults to the stage name. */
350
+ id: string;
351
+ checks: RubricCheck[];
352
+ }
353
+ /**
354
+ * Judge response for a single sample (one API call). The judge is asked to
355
+ * return structured JSON; `scores[id]` maps rubric check id → integer 1..5.
356
+ * `rationales[id]` is a short plain-text explanation, useful in reports but
357
+ * never used for gating.
358
+ */
359
+ export interface JudgeSample {
360
+ scores: Record<string, number>;
361
+ rationales: Record<string, string>;
362
+ }
363
+ /** Aggregated judge output across N samples, per rubric check. */
364
+ export interface JudgeAggregate {
365
+ checkId: string;
366
+ samples: number[];
367
+ median: number;
368
+ mean: number;
369
+ /** True iff every sample returned a score for this check. */
370
+ coverage: boolean;
371
+ }
372
+ /**
373
+ * Judge invocation result. Produced by `runJudge` and consumed by the
374
+ * runner: the runner converts each aggregate into a `VerifierResult` and
375
+ * records `usageUsd` toward the per-case cost.
376
+ */
377
+ export interface JudgeInvocation {
378
+ rubricId: string;
379
+ samples: JudgeSample[];
380
+ aggregates: JudgeAggregate[];
381
+ usageUsd: number;
382
+ durationMs: number;
383
+ }
@@ -0,0 +1,40 @@
1
+ /**
2
+ * LLM judge verifier — Step 3.
3
+ *
4
+ * Given an artifact and the stage's rubric, runs N judge samples (default
5
+ * median-of-3) against the configured LLM, aggregates the per-check
6
+ * scores, and returns one VerifierResult per rubric check plus one
7
+ * aggregate result covering the whole stage.
8
+ *
9
+ * Deterministic pieces (JSON parsing, aggregation, scoring) are kept pure
10
+ * so unit tests inject a stub EvalLlmClient and assert on the aggregate
11
+ * math without touching the network.
12
+ */
13
+ import { type EvalLlmClient } from "../llm-client.js";
14
+ import type { JudgeExpected, JudgeInvocation, JudgeSample, ResolvedEvalConfig, RubricDoc, VerifierResult } from "../types.js";
15
+ export interface RunJudgeOptions {
16
+ artifact: string;
17
+ rubric: RubricDoc;
18
+ config: Pick<ResolvedEvalConfig, "model" | "judgeModel" | "judgeSamples" | "judgeTemperature" | "timeoutMs" | "tokenPricing">;
19
+ client: EvalLlmClient;
20
+ /** Per-case hint that overlays the rubric (sample count, minimums). */
21
+ caseHint?: JudgeExpected;
22
+ /** Optional base seed; incremented per sample for reproducibility. */
23
+ baseSeed?: number;
24
+ }
25
+ /**
26
+ * Parse one judge response into a JudgeSample. The parser is intentionally
27
+ * forgiving with rationales (missing -> empty string) but strict with
28
+ * scores: missing or non-numeric entries are dropped and the coverage
29
+ * flag on the aggregate flips to false.
30
+ */
31
+ export declare function parseJudgeResponse(content: string, rubric: RubricDoc): JudgeSample;
32
+ /** Run the judge against an artifact and return per-sample + aggregate data. */
33
+ export declare function runJudge(options: RunJudgeOptions): Promise<JudgeInvocation>;
34
+ /**
35
+ * Convert a JudgeInvocation into VerifierResult[] for the runner. One
36
+ * result per rubric check (score 0..1 normalized from the 1..5 median) +
37
+ * one "coverage" result that flips to `ok:false` when any sample failed
38
+ * to emit a score for a check.
39
+ */
40
+ export declare function judgeResultsToVerifiers(rubric: RubricDoc, invocation: JudgeInvocation, config: Pick<ResolvedEvalConfig, "regression">, caseHint?: JudgeExpected): VerifierResult[];