npm - cclaw-cli - Versions diffs - 0.24.0 → 0.25.0 - Mend

cclaw-cli 0.24.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/dist/cli.js +2 -1
package/dist/content/eval-scaffold.d.ts +5 -1
package/dist/content/eval-scaffold.js +284 -3
package/dist/eval/agents/single-shot.d.ts +27 -0
package/dist/eval/agents/single-shot.js +79 -0
package/dist/eval/config-loader.js +96 -3
package/dist/eval/cost-guard.d.ts +80 -0
package/dist/eval/cost-guard.js +153 -0
package/dist/eval/llm-client.d.ts +113 -20
package/dist/eval/llm-client.js +242 -10
package/dist/eval/report.js +26 -0
package/dist/eval/rubric-loader.d.ts +20 -0
package/dist/eval/rubric-loader.js +143 -0
package/dist/eval/runner.d.ts +7 -0
package/dist/eval/runner.js +145 -12
package/dist/eval/types.d.ts +103 -1
package/dist/eval/verifiers/judge.d.ts +40 -0
package/dist/eval/verifiers/judge.js +256 -0
package/dist/install.js +7 -1
package/package.json +2 -1

package/dist/eval/rubric-loader.js ADDED Viewed

@@ -0,0 +1,143 @@
+/**
+ * Loader + validator for `.cclaw/evals/rubrics/<stage>.yaml`.
+ *
+ * Each file maps to exactly one `RubricDoc` that drives the LLM judge.
+ * Validation is strict: unknown top-level keys, missing required fields,
+ * duplicate check ids, and malformed weights all surface as actionable
+ * errors rather than turning into silent "judge had nothing to score"
+ * passes.
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+import { parse } from "yaml";
+import { EVALS_ROOT } from "../constants.js";
+import { exists } from "../fs-utils.js";
+import { FLOW_STAGES } from "../types.js";
+export function rubricsDir(projectRoot) {
+    return path.join(projectRoot, EVALS_ROOT, "rubrics");
+}
+export function rubricPath(projectRoot, stage) {
+    return path.join(rubricsDir(projectRoot), `${stage}.yaml`);
+}
+function rubricError(file, reason) {
+    return new Error(`Invalid rubric at ${file}: ${reason}\n` +
+        `See docs/evals.md for the rubric schema. Fields: stage (required), id (optional, defaults to stage), checks[] with id + prompt.`);
+}
+function isRecord(value) {
+    return typeof value === "object" && value !== null && !Array.isArray(value);
+}
+/**
+ * Validate one entry of a rubric's `checks` array and return it in
+ * normalized form.
+ *
+ * @param raw - Parsed YAML value found at `checks[index]`.
+ * @param index - Position within `checks`; only used in error messages.
+ * @param file - Rubric file path; only used in error messages.
+ * @returns An object with `id` and the trimmed `prompt`, plus `scale`
+ *   (trimmed), `weight` and `critical` when they were provided.
+ * @throws Error built by `rubricError` for the first violation found.
+ */
+function validateCheck(raw, index, file) {
+    if (!isRecord(raw)) {
+        throw rubricError(file, `checks[${index}] must be a mapping`);
+    }
+    const id = raw.id;
+    if (typeof id !== "string" || id.trim().length === 0) {
+        throw rubricError(file, `checks[${index}].id must be a non-empty string`);
+    }
+    // The pattern is tested against the untrimmed id, so an id with
+    // surrounding whitespace is rejected here rather than normalized.
+    if (!/^[a-z][a-z0-9-]*$/.test(id)) {
+        throw rubricError(file, `checks[${index}].id "${id}" must be kebab-case (lowercase letters, digits, hyphen; starts with a letter)`);
+    }
+    const prompt = raw.prompt;
+    if (typeof prompt !== "string" || prompt.trim().length === 0) {
+        throw rubricError(file, `checks[${index}].prompt must be a non-empty string`);
+    }
+    // Required fields first; optional fields are copied over only when present.
+    const check = {
+        id,
+        prompt: prompt.trim()
+    };
+    if (raw.scale !== undefined) {
+        if (typeof raw.scale !== "string" || raw.scale.trim().length === 0) {
+            throw rubricError(file, `checks[${index}].scale must be a non-empty string when provided`);
+        }
+        check.scale = raw.scale.trim();
+    }
+    if (raw.weight !== undefined) {
+        // Rejects NaN and +/-Infinity as well as negative numbers; zero is allowed.
+        if (typeof raw.weight !== "number" || !Number.isFinite(raw.weight) || raw.weight < 0) {
+            throw rubricError(file, `checks[${index}].weight must be a non-negative number when provided`);
+        }
+        check.weight = raw.weight;
+    }
+    if (raw.critical !== undefined) {
+        if (typeof raw.critical !== "boolean") {
+            throw rubricError(file, `checks[${index}].critical must be a boolean when provided`);
+        }
+        check.critical = raw.critical;
+    }
+    // Unknown keys are reported last, so a field-level error on a known
+    // key takes precedence over an unknown-key error in the same check.
+    const known = new Set(["id", "prompt", "scale", "weight", "critical"]);
+    const unknown = Object.keys(raw).filter((key) => !known.has(key));
+    if (unknown.length > 0) {
+        throw rubricError(file, `checks[${index}] has unknown key(s): ${unknown.join(", ")}`);
+    }
+    return check;
+}
+/**
+ * Validate the parsed content of a rubric file and return it as
+ * `{ stage, id, checks }`.
+ *
+ * @param raw - Value produced by parsing the rubric YAML.
+ * @param file - Rubric file path; only used in error messages.
+ * @returns The rubric with `id` defaulted to `stage` when absent and
+ *   every check normalized by `validateCheck`.
+ * @throws Error built by `rubricError` for the first violation found.
+ */
+function validateRubric(raw, file) {
+    if (!isRecord(raw)) {
+        throw rubricError(file, "top-level value must be a mapping");
+    }
+    const stage = raw.stage;
+    if (typeof stage !== "string" || !FLOW_STAGES.includes(stage)) {
+        throw rubricError(file, `"stage" must be one of: ${FLOW_STAGES.join(", ")} (got: ${JSON.stringify(stage)})`);
+    }
+    const id = raw.id;
+    // The rubric id falls back to the stage name; an explicit id is stored trimmed.
+    let rubricId = stage;
+    if (id !== undefined) {
+        if (typeof id !== "string" || id.trim().length === 0) {
+            throw rubricError(file, `"id" must be a non-empty string when provided`);
+        }
+        rubricId = id.trim();
+    }
+    const checks = raw.checks;
+    if (!Array.isArray(checks) || checks.length === 0) {
+        throw rubricError(file, `"checks" must be a non-empty array`);
+    }
+    const parsed = [];
+    // Ids already accepted, used to reject duplicates within this rubric.
+    const seen = new Set();
+    for (let i = 0; i < checks.length; i += 1) {
+        const check = validateCheck(checks[i], i, file);
+        if (seen.has(check.id)) {
+            throw rubricError(file, `duplicate check id: "${check.id}"`);
+        }
+        seen.add(check.id);
+        parsed.push(check);
+    }
+    // Unknown top-level keys are reported last, after all field-level checks.
+    const known = new Set(["stage", "id", "checks"]);
+    const unknown = Object.keys(raw).filter((key) => !known.has(key));
+    if (unknown.length > 0) {
+        throw rubricError(file, `unknown top-level key(s): ${unknown.join(", ")}`);
+    }
+    return {
+        stage: stage,
+        id: rubricId,
+        checks: parsed
+    };
+}
+/**
+ * Load the rubric for `stage`. Returns `undefined` when the file is
+ * missing so callers can emit a "no rubric" verifier result rather than
+ * crashing — authors are expected to grow rubrics incrementally.
+ */
+export async function loadRubric(projectRoot, stage) {
+    const file = rubricPath(projectRoot, stage);
+    if (!(await exists(file)))
+        return undefined;
+    let parsed;
+    try {
+        parsed = parse(await fs.readFile(file, "utf8"));
+    }
+    catch (err) {
+        throw rubricError(file, err instanceof Error ? err.message : String(err));
+    }
+    return validateRubric(parsed, file);
+}
+/** Load every rubric present in the given rubrics directory. */
+export async function loadAllRubrics(projectRoot) {
+    const out = new Map();
+    for (const stage of FLOW_STAGES) {
+        const doc = await loadRubric(projectRoot, stage);
+        if (doc)
+            out.set(stage, doc);
+    }
+    return out;
+}
+/** Exposed for tests: gives direct access to the module-private validators. */
+export const __internal = { validateRubric, validateCheck };

package/dist/eval/runner.d.ts CHANGED Viewed

@@ -1,4 +1,5 @@
 import type { FlowStage } from "../types.js";
+import { type EvalLlmClient } from "./llm-client.js";
 import type { EvalReport, EvalTier, ResolvedEvalConfig } from "./types.js";
 export interface RunEvalOptions {
     projectRoot: string;
@@ -14,6 +15,12 @@ export interface RunEvalOptions {
     dryRun?: boolean;
     /** Override process.env during tests. */
     env?: NodeJS.ProcessEnv;
+    /**
+     * Optional LLM client injection. Primary use case: unit and
+     * integration tests that want deterministic judge + agent behavior
+     * without hitting the network.
+     */
+    llmClient?: EvalLlmClient;
 }
 export interface DryRunSummary {
     kind: "dry-run";

package/dist/eval/runner.js CHANGED Viewed

@@ -1,9 +1,14 @@
 import { randomUUID } from "node:crypto";
 import { CCLAW_VERSION } from "../constants.js";
 import { FLOW_STAGES } from "../types.js";
+import { runSingleShot } from "./agents/single-shot.js";
 import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
 import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
 import { loadEvalConfig } from "./config-loader.js";
+import { createCostGuard, DailyCostCapExceededError } from "./cost-guard.js";
+import { createEvalClient, EvalLlmError } from "./llm-client.js";
+import { loadAllRubrics } from "./rubric-loader.js";
+import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
 import { verifyRules } from "./verifiers/rules.js";
 import { verifyStructural } from "./verifiers/structural.js";
 import { verifyTraceability } from "./verifiers/traceability.js";
@@ -26,16 +31,38 @@ function skeletonVerifierResult(message, details) {
 /**
  * --schema-only narrows to structural. --rules opens up rules + traceability
  * on top of structural (traceability is a rule-family verifier even though
- * it lives in its own module). Default (no flag) matches --schema-only for
- * backwards compatibility with the Step 1 gate.
+ * it lives in its own module). --judge opens up the LLM judge and, for
+ * Tier A, the single-shot agent-under-test. --schema-only always wins so
+ * the LLM-free PR gate never pays for tokens even if stale flags collide.
  */
 function resolveRunFlags(options) {
     const rulesRequested = options.rules === true;
     const schemaOnly = options.schemaOnly === true;
+    const judgeRequested = options.judge === true;
+    const runJudge = judgeRequested && !schemaOnly;
+    const runAgent = runJudge && (options.tier ?? "A") === "A";
     return {
         runStructural: true,
         runRules: rulesRequested && !schemaOnly,
-        runTraceability: rulesRequested && !schemaOnly
+        runTraceability: rulesRequested && !schemaOnly,
+        runJudge,
+        runAgent
+    };
+}
+/**
+ * Wrap a client so every chat() result is accounted against the cost
+ * guard before being returned. The guard throws
+ * DailyCostCapExceededError if committing the call would cross the
+ * configured cap — the runner surfaces that as a hard failure so
+ * nightly CI fails loud instead of silently overspending.
+ */
+function wrapClientWithCostGuard(client, costGuard, fallbackModel) {
+    return {
+        async chat(request) {
+            const response = await client.chat(request);
+            await costGuard.commit(response.model || fallbackModel, response.usage);
+            return response;
+        }
     };
 }
 async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
@@ -54,17 +81,61 @@ async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
         return undefined;
     }
 }
-async function runCase(projectRoot, caseEntry, plannedTier, flags) {
+async function runCase(ctx) {
+    const { projectRoot, caseEntry, plannedTier, flags, config, client, costGuard, rubrics } = ctx;
     const started = Date.now();
     const verifierResults = [];
     const expected = caseEntry.expected;
+    let caseCostUsd = 0;
     const hasStructural = !!expected?.structural && Object.keys(expected.structural).length > 0;
     const hasRules = flags.runRules && !!expected?.rules && Object.keys(expected.rules).length > 0;
     const hasTraceability = flags.runTraceability && !!expected?.traceability;
-    const needsArtifact = hasStructural || hasRules || hasTraceability;
+    const judgeRequested = flags.runJudge && !!expected?.judge;
+    const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
     let artifact;
     if (needsArtifact) {
-        artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
+        if (flags.runAgent && judgeRequested && client) {
+            try {
+                const produced = await runSingleShot({
+                    caseEntry,
+                    config,
+                    projectRoot,
+                    client
+                });
+                artifact = produced.artifact;
+                caseCostUsd += produced.usageUsd;
+                verifierResults.push({
+                    kind: "workflow",
+                    id: "agent:single-shot",
+                    ok: true,
+                    score: 1,
+                    message: `single-shot agent produced ${produced.artifact.length} char(s) in ${produced.durationMs}ms`,
+                    details: {
+                        model: produced.model,
+                        tokensIn: produced.usage.promptTokens,
+                        tokensOut: produced.usage.completionTokens,
+                        usageUsd: produced.usageUsd,
+                        attempts: produced.attempts
+                    }
+                });
+            }
+            catch (err) {
+                if (err instanceof DailyCostCapExceededError)
+                    throw err;
+                const retryable = err instanceof EvalLlmError ? err.retryable : false;
+                verifierResults.push({
+                    kind: "workflow",
+                    id: "agent:single-shot",
+                    ok: false,
+                    score: 0,
+                    message: err instanceof Error ? err.message : String(err),
+                    details: { retryable }
+                });
+            }
+        }
+        else {
+            artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
+        }
         if (artifact === undefined && verifierResults.length === 0) {
             verifierResults.push({
                 kind: "structural",
@@ -111,6 +182,46 @@ async function runCase(projectRoot, caseEntry, plannedTier, flags) {
             });
         }
     }
+    if (judgeRequested && artifact !== undefined && client) {
+        const rubric = rubrics.get(caseEntry.stage);
+        if (!rubric) {
+            verifierResults.push({
+                kind: "judge",
+                id: "judge:rubric:missing",
+                ok: false,
+                score: 0,
+                message: `No rubric at .cclaw/evals/rubrics/${caseEntry.stage}.yaml. Add one before running --judge.`,
+                details: { stage: caseEntry.stage }
+            });
+        }
+        else {
+            try {
+                const invocation = await runJudge({
+                    artifact,
+                    rubric,
+                    config,
+                    client,
+                    caseHint: expected.judge
+                });
+                caseCostUsd += invocation.usageUsd;
+                const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, expected.judge);
+                verifierResults.push(...judgeVerifiers);
+            }
+            catch (err) {
+                if (err instanceof DailyCostCapExceededError)
+                    throw err;
+                const retryable = err instanceof EvalLlmError ? err.retryable : false;
+                verifierResults.push({
+                    kind: "judge",
+                    id: "judge:invocation:error",
+                    ok: false,
+                    score: 0,
+                    message: err instanceof Error ? err.message : String(err),
+                    details: { retryable, rubricId: rubric.id }
+                });
+            }
+        }
+    }
     const nonSkippedResults = verifierResults.filter((r) => r.details?.skipped !== true);
     const allOk = nonSkippedResults.length === 0
         ? verifierResults.every((r) => r.ok)
@@ -121,6 +232,7 @@ async function runCase(projectRoot, caseEntry, plannedTier, flags) {
         tier: plannedTier,
         passed: allOk,
         durationMs: Date.now() - started,
+        costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
         verifierResults
     };
 }
@@ -173,10 +285,13 @@ export async function runEval(options) {
     if (corpus.length === 0) {
         notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
     }
-    if (options.judge) {
-        notes.push("--judge is accepted; LLM judging is not wired yet.");
-    }
     const flags = resolveRunFlags(options);
+    if (flags.runJudge && !config.apiKey && !options.llmClient) {
+        notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
+    }
+    if ((options.tier ?? "A") !== "A" && flags.runJudge) {
+        notes.push("Tier B/C agent-under-test is not wired yet; --judge will score the committed fixture as a stand-in.");
+    }
     if (options.dryRun === true) {
         const summary = {
             kind: "dry-run",
@@ -190,17 +305,35 @@ export async function runEval(options) {
             verifiersAvailable: {
                 structural: flags.runStructural,
                 rules: flags.runRules,
-                judge: false,
-                workflow: false
+                judge: flags.runJudge,
+                workflow: flags.runAgent
             },
             notes
         };
         return summary;
     }
+    const costGuard = createCostGuard(options.projectRoot, config);
+    let wrappedClient;
+    if (flags.runJudge) {
+        const base = options.llmClient ?? createEvalClient(config);
+        wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
+    }
+    const rubrics = flags.runJudge
+        ? await loadAllRubrics(options.projectRoot)
+        : new Map();
     const now = new Date().toISOString();
     const caseResults = [];
     for (const item of corpus) {
-        caseResults.push(await runCase(options.projectRoot, item, plannedTier, flags));
+        caseResults.push(await runCase({
+            projectRoot: options.projectRoot,
+            caseEntry: item,
+            plannedTier,
+            flags,
+            config,
+            client: wrappedClient,
+            costGuard,
+            rubrics
+        }));
     }
     const stages = stagesInResults(caseResults);
     const baselines = await loadBaselinesByStage(options.projectRoot, stages);

package/dist/eval/types.d.ts CHANGED Viewed

@@ -114,6 +114,31 @@ export interface TraceabilityExpected {
      */
     requireIn: string[];
 }
+/**
+ * LLM-judge expectations — Step 3.
+ *
+ * When present and `--judge` is active, the judge runs against the
+ * resolved artifact: the single-shot agent's output in Tier A, or the
+ * committed fixture in Tier B/C, whose agent-under-test is not wired
+ * yet. `--schema-only` disables the judge entirely. Every field below is
+ * optional; the case-level hint overlays the stage-level rubric loaded
+ * from `.cclaw/evals/rubrics/<stage>.yaml`.
+ */
+export interface JudgeExpected {
+    /**
+     * Per-case check ids that MUST be present in the stage rubric. Used when
+     * a case wants to assert the rubric covers scenario-specific properties.
+     */
+    requiredChecks?: string[];
+    /**
+     * Stage rubric identifier when a stage ships multiple rubrics (e.g.
+     * "strict" vs. "lenient"). Defaults to the stage name.
+     */
+    rubric?: string;
+    /** Optional override of `config.judgeSamples` for the case. */
+    samples?: number;
+    /** Per-check minimum score (1..5 scale). Fail when any score drops below. */
+    minimumScores?: Record<string, number>;
+}
 /** Superset of per-verifier expectation shapes. */
 export interface ExpectedShape {
     structural?: StructuralExpected;
@@ -122,7 +147,7 @@ export interface ExpectedShape {
     /** Cross-stage ID propagation checks — Step 2. */
     traceability?: TraceabilityExpected;
     /** LLM-judge rubrics — Step 3. */
-    judge?: Record<string, unknown>;
+    judge?: JudgeExpected;
 }
 /**
  * A single eval case describes one input scenario for one stage. Cases live in
@@ -228,6 +253,26 @@ export interface EvalConfig {
     timeoutMs: number;
     /** Max retries per API call on transient failures. */
     maxRetries: number;
+    /**
+     * Number of judge samples per case (median-of-N). Defaults to 3 when unset.
+     * Must be odd so a true median exists.
+     */
+    judgeSamples?: number;
+    /** Sampling temperature for judge calls. Defaults to 0.0. */
+    judgeTemperature?: number;
+    /** Sampling temperature for the agent-under-test. Defaults to 0.2. */
+    agentTemperature?: number;
+    /**
+     * Optional per-model USD pricing used by the cost guard. Keys match
+     * `model` / `judgeModel`. Values in USD per 1K tokens, so
+     * `{ input: 0.0005, output: 0.0015 }` = $0.50 per 1M input tokens.
+     */
+    tokenPricing?: Record<string, TokenPricing>;
+}
+/** Per-model pricing schedule, expressed as USD per 1K tokens. */
+export interface TokenPricing {
+    input: number;
+    output: number;
 }
 /** Resolved config with env overrides applied. */
 export interface ResolvedEvalConfig extends EvalConfig {
@@ -279,3 +324,60 @@ export interface BaselineRegression {
     previousScore?: number;
     currentScore?: number;
 }
+/**
+ * One rubric check evaluated by the LLM judge. Scored on a 1..5 scale;
+ * 5 means "the artifact fully meets the bar described by `prompt`".
+ */
+export interface RubricCheck {
+    /** Kebab-case slug, unique per rubric. Stable across runs. */
+    id: string;
+    /** Natural-language question posed to the judge. */
+    prompt: string;
+    /** Human-readable scale description rendered in judge prompts. */
+    scale?: string;
+    /** Relative weight for the stage's aggregate score. Defaults to 1.0. */
+    weight?: number;
+    /**
+     * When true, any sample below `config.regression.failIfCriticalBelow`
+     * flips the verifier to `ok:false` (not just a score drop).
+     */
+    critical?: boolean;
+}
+/** Parsed `.cclaw/evals/rubrics/<stage>.yaml`. */
+export interface RubricDoc {
+    stage: FlowStage;
+    /**
+     * Rubric variant label. Optional in the YAML file, where it defaults
+     * to the stage name; always set on the parsed document.
+     */
+    id: string;
+    checks: RubricCheck[];
+}
+/**
+ * Judge response for a single sample (one API call). The judge is asked to
+ * return structured JSON; `scores[id]` maps rubric check id → integer 1..5.
+ * `rationales[id]` is a short plain-text explanation, useful in reports but
+ * never used for gating.
+ */
+export interface JudgeSample {
+    scores: Record<string, number>;
+    rationales: Record<string, string>;
+}
+/** Aggregated judge output across N samples, per rubric check. */
+export interface JudgeAggregate {
+    checkId: string;
+    samples: number[];
+    median: number;
+    mean: number;
+    /** True iff every sample returned a score for this check. */
+    coverage: boolean;
+}
+/**
+ * Judge invocation result. Produced by `runJudge` and consumed by the
+ * runner: the runner converts each aggregate into a `VerifierResult` and
+ * records `usageUsd` toward the per-case cost.
+ */
+export interface JudgeInvocation {
+    rubricId: string;
+    samples: JudgeSample[];
+    aggregates: JudgeAggregate[];
+    usageUsd: number;
+    durationMs: number;
+}

package/dist/eval/verifiers/judge.d.ts ADDED Viewed

@@ -0,0 +1,40 @@
+/**
+ * LLM judge verifier — Step 3.
+ *
+ * Given an artifact and the stage's rubric, runs N judge samples (default
+ * median-of-3) against the configured LLM, aggregates the per-check
+ * scores, and returns one VerifierResult per rubric check plus one
+ * aggregate result covering the whole stage.
+ *
+ * Deterministic pieces (JSON parsing, aggregation, scoring) are kept pure
+ * so unit tests inject a stub EvalLlmClient and assert on the aggregate
+ * math without touching the network.
+ */
+import { type EvalLlmClient } from "../llm-client.js";
+import type { JudgeExpected, JudgeInvocation, JudgeSample, ResolvedEvalConfig, RubricDoc, VerifierResult } from "../types.js";
+export interface RunJudgeOptions {
+    artifact: string;
+    rubric: RubricDoc;
+    config: Pick<ResolvedEvalConfig, "model" | "judgeModel" | "judgeSamples" | "judgeTemperature" | "timeoutMs" | "tokenPricing">;
+    client: EvalLlmClient;
+    /** Per-case hint that overlays the rubric (sample count, minimums). */
+    caseHint?: JudgeExpected;
+    /** Optional base seed; incremented per sample for reproducibility. */
+    baseSeed?: number;
+}
+/**
+ * Parse one judge response into a JudgeSample. The parser is intentionally
+ * forgiving with rationales (missing -> empty string) but strict with
+ * scores: missing or non-numeric entries are dropped and the coverage
+ * flag on the aggregate flips to false.
+ */
+export declare function parseJudgeResponse(content: string, rubric: RubricDoc): JudgeSample;
+/** Run the judge against an artifact and return per-sample + aggregate data. */
+export declare function runJudge(options: RunJudgeOptions): Promise<JudgeInvocation>;
+/**
+ * Convert a JudgeInvocation into VerifierResult[] for the runner. One
+ * result per rubric check (score 0..1 normalized from the 1..5 median) +
+ * one "coverage" result that flips to `ok:false` when any sample failed
+ * to emit a score for a check.
+ */
+export declare function judgeResultsToVerifiers(rubric: RubricDoc, invocation: JudgeInvocation, config: Pick<ResolvedEvalConfig, "regression">, caseHint?: JudgeExpected): VerifierResult[];