npm - cclaw-cli - Versions diffs - 0.26.0 → 0.27.0 - Mend

cclaw-cli 0.26.0 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

package/dist/cli.d.ts +4 -0
package/dist/cli.js +78 -4
package/dist/eval/agents/with-tools.d.ts +14 -1
package/dist/eval/agents/with-tools.js +17 -11
package/dist/eval/agents/workflow.d.ts +24 -0
package/dist/eval/agents/workflow.js +133 -0
package/dist/eval/config-loader.js +6 -2
package/dist/eval/diff.d.ts +64 -0
package/dist/eval/diff.js +323 -0
package/dist/eval/report.js +35 -0
package/dist/eval/runner.d.ts +10 -1
package/dist/eval/runner.js +236 -19
package/dist/eval/types.d.ts +117 -1
package/dist/eval/types.js +21 -1
package/dist/eval/verifiers/workflow-consistency.d.ts +21 -0
package/dist/eval/verifiers/workflow-consistency.js +225 -0
package/dist/eval/workflow-corpus.d.ts +7 -0
package/dist/eval/workflow-corpus.js +207 -0
package/package.json +1 -1

package/dist/eval/runner.js CHANGED Viewed

@@ -3,8 +3,10 @@ import { CCLAW_VERSION } from "../constants.js";
 import { FLOW_STAGES } from "../types.js";
 import { runSingleShot } from "./agents/single-shot.js";
 import { MaxTurnsExceededError, runWithTools } from "./agents/with-tools.js";
+import { runWorkflow } from "./agents/workflow.js";
 import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
 import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
+import { loadWorkflowCorpus } from "./workflow-corpus.js";
 import { loadEvalConfig } from "./config-loader.js";
 import { createCostGuard, DailyCostCapExceededError } from "./cost-guard.js";
 import { createEvalClient, EvalLlmError } from "./llm-client.js";
@@ -13,6 +15,7 @@ import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
 import { verifyRules } from "./verifiers/rules.js";
 import { verifyStructural } from "./verifiers/structural.js";
 import { verifyTraceability } from "./verifiers/traceability.js";
+import { verifyWorkflowConsistency } from "./verifiers/workflow-consistency.js";
 function groupByStage(cases) {
     return cases.reduce((acc, item) => {
         acc[item.stage] = (acc[item.stage] ?? 0) + 1;
@@ -42,7 +45,13 @@ function resolveRunFlags(options) {
     const judgeRequested = options.judge === true;
     const tier = options.tier ?? "A";
     const runJudge = judgeRequested && !schemaOnly;
-    const runAgent = runJudge && (tier === "A" || tier === "B");
+    // Tier C always needs the agent loop (no fixture fallback for workflows),
+    // so we still require an LLM client but we do NOT require --judge on the
+    // CLI to produce a workflow run. The judge piece itself stays gated by
+    // `runJudge` so consistency-only runs are cheap and deterministic.
+    const runAgent = tier === "C"
+        ? !schemaOnly
+        : runJudge && (tier === "A" || tier === "B");
     return {
         runStructural: true,
         runRules: rulesRequested && !schemaOnly,
@@ -83,6 +92,184 @@ async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
         return undefined;
     }
 }
+/**
+ * Builds the judge hint for a single workflow stage step.
+ *
+ * Copies `rubric`, `requiredChecks` and `minimumScores` from `step` onto a
+ * fresh object, leaving out any of the three that is falsy.
+ *
+ * @param step Workflow stage step whose judge overrides are read.
+ * @returns A new object holding only the truthy override fields; an empty
+ *   object when the step sets none of them.
+ */
+function stageJudgeHint(step) {
+    const hint = {};
+    if (step.rubric)
+        hint.rubric = step.rubric;
+    if (step.requiredChecks)
+        hint.requiredChecks = step.requiredChecks;
+    if (step.minimumScores)
+        hint.minimumScores = step.minimumScores;
+    return hint;
+}
+/**
+ * Runs one Tier C workflow case and folds every verifier outcome into a
+ * single case result.
+ *
+ * Steps, in order:
+ *  1. Return a failing `workflow:agent:disabled` result when
+ *     `flags.runAgent` is false or no `client` was supplied.
+ *  2. Execute the stages through `runWorkflow`. Any error other than
+ *     `DailyCostCapExceededError` is recorded as a failing
+ *     `workflow:agent:error` result and ends the case.
+ *  3. When `flags.runJudge` is set, judge each stage artifact with `runJudge`
+ *     and store the per-check medians and an ok flag on the stage result.
+ *  4. Append whatever `verifyWorkflowConsistency` returns.
+ *
+ * @param ctx Case context. Reads `projectRoot`, `workflow`, `plannedTier`,
+ *   `flags`, `config`, `client` and `rubrics`; any other property (for
+ *   example the `costGuard` that `runEval` passes) is not used here.
+ * @returns The case result. `stage` is the name of the last declared stage
+ *   ("plan" when there is none). `costUsd` and `workflow` are only set on the
+ *   path where `runWorkflow` completed; the two early-return paths omit them.
+ * @throws {DailyCostCapExceededError} Re-thrown unchanged when raised by
+ *   `runWorkflow` or `runJudge`.
+ */
+async function runWorkflowCase(ctx) {
+    const { projectRoot, workflow, plannedTier, flags, config, client, rubrics } = ctx;
+    const started = Date.now();
+    const verifierResults = [];
+    // Running USD total: workflow usage plus every successful judge invocation.
+    let caseCostUsd = 0;
+    // The case is reported under its last declared stage; "plan" is the
+    // fallback when `workflow.stages` is empty.
+    const lastStage = workflow.stages[workflow.stages.length - 1]?.name ??
+        "plan";
+    // Without the agent flag or a client there is nothing to execute: emit a
+    // single failing result that lists the stages that would have run.
+    if (!flags.runAgent || !client) {
+        verifierResults.push({
+            kind: "workflow",
+            id: "workflow:agent:disabled",
+            ok: false,
+            score: 0,
+            message: "Tier C requires the with-tools agent (CCLAW_EVAL_API_KEY or injected client). " +
+                "Re-run with credentials to execute the workflow.",
+            details: { stages: workflow.stages.map((s) => s.name) }
+        });
+        return {
+            caseId: workflow.id,
+            stage: lastStage,
+            tier: plannedTier,
+            passed: false,
+            durationMs: Date.now() - started,
+            verifierResults
+        };
+    }
+    let workflowResult;
+    try {
+        workflowResult = await runWorkflow({
+            workflow,
+            config,
+            projectRoot,
+            client
+        });
+    }
+    catch (err) {
+        // Cost-cap errors propagate to the caller instead of being recorded
+        // as a per-case failure.
+        if (err instanceof DailyCostCapExceededError)
+            throw err;
+        const retryable = err instanceof EvalLlmError ? err.retryable : false;
+        const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
+        verifierResults.push({
+            kind: "workflow",
+            id: "workflow:agent:error",
+            ok: false,
+            score: 0,
+            message: err instanceof Error ? err.message : String(err),
+            details: {
+                retryable,
+                // Only attached when the failure was a turn-limit overrun.
+                ...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {})
+            }
+        });
+        // NOTE(review): this return carries no `costUsd`, so any spend made
+        // before the failure is not reported on the case — confirm intended.
+        return {
+            caseId: workflow.id,
+            stage: lastStage,
+            tier: plannedTier,
+            passed: false,
+            durationMs: Date.now() - started,
+            verifierResults
+        };
+    }
+    caseCostUsd += workflowResult.totalUsageUsd;
+    // Shallow copy: the array is new, but its elements are the same objects
+    // as in `workflowResult.stages`, so the judge fields written below land
+    // on those objects too.
+    const stageResults = [...workflowResult.stages];
+    verifierResults.push({
+        kind: "workflow",
+        id: "workflow:agent",
+        ok: true,
+        score: 1,
+        message: `workflow ran ${stageResults.length} stage(s) in ` +
+            `${workflowResult.totalDurationMs}ms ` +
+            `(spent $${workflowResult.totalUsageUsd.toFixed(6)})`,
+        details: {
+            stages: stageResults.map((s) => ({
+                name: s.stage,
+                durationMs: s.durationMs,
+                usageUsd: s.usageUsd,
+                turns: s.toolUse.turns,
+                calls: s.toolUse.calls
+            }))
+        }
+    });
+    let allJudgeOk = true;
+    if (flags.runJudge) {
+        for (let i = 0; i < workflow.stages.length; i += 1) {
+            const step = workflow.stages[i];
+            // NOTE(review): assumes `runWorkflow` returned one result per
+            // declared stage, in the same order; a shorter array would make
+            // `stageResult` undefined and the writes below throw — confirm.
+            const stageResult = stageResults[i];
+            // NOTE(review): the rubric is looked up by stage name; a
+            // `step.rubric` override is only forwarded through the hint
+            // below — confirm that is the intended resolution.
+            const rubric = rubrics.get(step.name);
+            if (!rubric) {
+                verifierResults.push({
+                    kind: "judge",
+                    id: `judge:rubric:missing:${step.name}`,
+                    ok: false,
+                    score: 0,
+                    message: `No rubric at .cclaw/evals/rubrics/${step.name}.yaml.`,
+                    details: { stage: step.name }
+                });
+                allJudgeOk = false;
+                stageResult.judgeOk = false;
+                continue;
+            }
+            const hint = stageJudgeHint(step);
+            try {
+                const invocation = await runJudge({
+                    artifact: stageResult.artifact,
+                    rubric,
+                    config,
+                    client,
+                    caseHint: hint
+                });
+                caseCostUsd += invocation.usageUsd;
+                const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, hint);
+                // Per-check median scores keyed by check id, kept on the
+                // stage result.
+                const medians = {};
+                for (const agg of invocation.aggregates) {
+                    medians[agg.checkId] = agg.median;
+                }
+                stageResult.judgeMedians = medians;
+                const stageOk = judgeVerifiers.every((v) => v.ok);
+                stageResult.judgeOk = stageOk;
+                if (!stageOk)
+                    allJudgeOk = false;
+                // Tag each judge result with its stage: the id gets a
+                // `:<stage>` suffix and `details.stage` is set.
+                for (const v of judgeVerifiers) {
+                    verifierResults.push({
+                        ...v,
+                        id: `${v.id}:${step.name}`,
+                        details: { ...(v.details ?? {}), stage: step.name }
+                    });
+                }
+            }
+            catch (err) {
+                // Same policy as for the agent run: the cost cap propagates,
+                // every other failure becomes a failing judge result and the
+                // loop moves on to the next stage.
+                if (err instanceof DailyCostCapExceededError)
+                    throw err;
+                const retryable = err instanceof EvalLlmError ? err.retryable : false;
+                verifierResults.push({
+                    kind: "judge",
+                    id: `judge:invocation:error:${step.name}`,
+                    ok: false,
+                    score: 0,
+                    message: err instanceof Error ? err.message : String(err),
+                    details: { retryable, rubricId: rubric.id, stage: step.name }
+                });
+                stageResult.judgeOk = false;
+                allJudgeOk = false;
+            }
+        }
+    }
+    // Consistency checks run whether or not the judge ran.
+    const consistencyResults = verifyWorkflowConsistency(workflowResult.artifacts, workflow.consistency);
+    verifierResults.push(...consistencyResults);
+    // Pass/fail ignores results flagged `details.skipped === true`. If every
+    // result were skipped, all results are considered instead; the
+    // `workflow:agent` entry pushed above carries no such flag, so that
+    // fallback is defensive on this path.
+    const nonSkipped = verifierResults.filter((r) => r.details?.skipped !== true);
+    const allOk = nonSkipped.length === 0
+        ? verifierResults.every((r) => r.ok)
+        : nonSkipped.every((r) => r.ok);
+    const workflowSummary = {
+        caseId: workflow.id,
+        stages: stageResults,
+        // Agent spend only; judge spend is folded into `costUsd` below.
+        totalUsageUsd: workflowResult.totalUsageUsd,
+        totalDurationMs: workflowResult.totalDurationMs,
+        // Reported as true when the judge did not run at all.
+        allJudgeOk: flags.runJudge ? allJudgeOk : true
+    };
+    return {
+        caseId: workflow.id,
+        stage: lastStage,
+        tier: plannedTier,
+        passed: allOk,
+        durationMs: Date.now() - started,
+        // Rounded to 6 decimals; left undefined when nothing was spent.
+        costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined,
+        verifierResults,
+        workflow: workflowSummary
+    };
+}
 async function runCase(ctx) {
     const { projectRoot, caseEntry, plannedTier, flags, config, client, costGuard, rubrics } = ctx;
     const started = Date.now();
@@ -327,18 +514,22 @@ function stagesInResults(caseResults) {
  */
 export async function runEval(options) {
     const config = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
-    const corpus = await loadCorpus(options.projectRoot, options.stage);
     const plannedTier = options.tier ?? config.defaultTier;
+    const corpus = plannedTier === "C" ? [] : await loadCorpus(options.projectRoot, options.stage);
+    const workflowCorpus = plannedTier === "C" ? await loadWorkflowCorpus(options.projectRoot) : [];
     const notes = [];
-    if (corpus.length === 0) {
+    if (plannedTier !== "C" && corpus.length === 0) {
         notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
     }
+    if (plannedTier === "C" && workflowCorpus.length === 0) {
+        notes.push("Workflow corpus is empty. Tier C cases live under `.cclaw/evals/corpus/workflows/*.yaml`.");
+    }
     const flags = resolveRunFlags(options);
     if (flags.runJudge && !config.apiKey && !options.llmClient) {
         notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
     }
-    if ((options.tier ?? "A") !== "A" && flags.runJudge) {
-        notes.push("Tier B/C agent-under-test is not wired yet; --judge will score the committed fixture as a stand-in.");
+    if (plannedTier === "C" && !config.apiKey && !options.llmClient) {
+        notes.push("Tier C requires CCLAW_EVAL_API_KEY (or an injected client for tests); workflow runs will fail per case without one.");
     }
     if (options.dryRun === true) {
         const summary = {
@@ -349,12 +540,20 @@ export async function runEval(options) {
                 byStage: groupByStage(corpus),
                 cases: corpus.map((item) => ({ id: item.id, stage: item.stage }))
             },
+            workflowCorpus: {
+                total: workflowCorpus.length,
+                cases: workflowCorpus.map((item) => ({
+                    id: item.id,
+                    stages: item.stages.map((s) => s.name)
+                }))
+            },
             plannedTier,
             verifiersAvailable: {
                 structural: flags.runStructural,
                 rules: flags.runRules,
                 judge: flags.runJudge,
-                workflow: flags.runAgent
+                workflow: flags.runAgent,
+                consistency: plannedTier === "C"
             },
             notes
         };
@@ -362,26 +561,44 @@ export async function runEval(options) {
     }
     const costGuard = createCostGuard(options.projectRoot, config);
     let wrappedClient;
-    if (flags.runJudge) {
+    const clientNeeded = flags.runJudge || plannedTier === "C";
+    if (clientNeeded) {
         const base = options.llmClient ?? createEvalClient(config);
         wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
     }
-    const rubrics = flags.runJudge
+    const rubricsNeeded = flags.runJudge;
+    const rubrics = rubricsNeeded
         ? await loadAllRubrics(options.projectRoot)
         : new Map();
     const now = new Date().toISOString();
     const caseResults = [];
-    for (const item of corpus) {
-        caseResults.push(await runCase({
-            projectRoot: options.projectRoot,
-            caseEntry: item,
-            plannedTier,
-            flags,
-            config,
-            client: wrappedClient,
-            costGuard,
-            rubrics
-        }));
+    if (plannedTier === "C") {
+        for (const wf of workflowCorpus) {
+            caseResults.push(await runWorkflowCase({
+                projectRoot: options.projectRoot,
+                workflow: wf,
+                plannedTier,
+                flags,
+                config,
+                client: wrappedClient,
+                costGuard,
+                rubrics
+            }));
+        }
+    }
+    else {
+        for (const item of corpus) {
+            caseResults.push(await runCase({
+                projectRoot: options.projectRoot,
+                caseEntry: item,
+                plannedTier,
+                flags,
+                config,
+                client: wrappedClient,
+                costGuard,
+                rubrics
+            }));
+        }
     }
     const stages = stagesInResults(caseResults);
     const baselines = await loadBaselinesByStage(options.projectRoot, stages);

package/dist/eval/types.d.ts CHANGED Viewed

@@ -24,8 +24,10 @@ export type EvalTier = (typeof EVAL_TIERS)[number];
 /**
  * Verifier kinds, in increasing cost and decreasing determinism:
  * structural and rules run without LLM; judge and workflow use the configured model.
+ * `consistency` is the Tier C cross-artifact family (deterministic but
+ * operates over multiple artifacts at once).
  */
-export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow"];
+export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow", "consistency"];
 export type VerifierKind = (typeof VERIFIER_KINDS)[number];
 /**
  * Structural expectations — deterministic, LLM-free checks against a single
@@ -199,6 +201,11 @@ export interface EvalCaseResult {
     durationMs: number;
     costUsd?: number;
     verifierResults: VerifierResult[];
+    /**
+     * Tier C only: the per-stage breakdown collected by the workflow
+     * agent. Unset for Tier A/B cases so the on-disk JSON stays small.
+     */
+    workflow?: WorkflowRunSummary;
 }
 /** Top-level eval report, serialized to JSON and rendered to Markdown. */
 export interface EvalReport {
@@ -286,6 +293,14 @@ export interface EvalConfig {
      * marker so the model sees the cutoff.
      */
     toolMaxResultBytes?: number;
+    /**
+     * Maximum total turns a single Tier C workflow case may consume
+     * across all stages combined. Defaults to 40 (stages × toolMaxTurns).
+     * Runs that exceed the cap fail the current stage with a
+     * `MaxTurnsExceededError` propagated from the underlying with-tools
+     * loop rather than a dedicated workflow-level error.
+     */
+    workflowMaxTotalTurns?: number;
 }
 /** Per-model pricing schedule, expressed as USD per 1K tokens. */
 export interface TokenPricing {
@@ -416,3 +431,104 @@ export interface ToolUseSummary {
     /** Per-tool call counts, keyed by tool name. */
     byTool: Record<string, number>;
 }
+/**
+ * Cross-stage consistency expectations for a Tier C workflow case. Every
+ * sub-check is optional so authors can opt in incrementally; an empty
+ * block produces zero verifier results.
+ */
+export interface WorkflowConsistencyExpected {
+    /**
+     * For each rule, every id extracted from the `from` stage must appear in
+     * every listed `to` stage. Typical entry: `{ idPattern: "D-\\d+", from:
+     * "scope", to: ["plan"] }`. Guards the "decisions flow downstream" rule.
+     */
+    idsFlow?: Array<{
+        /** Pattern used to extract ids from the `from` stage, e.g. `"D-\\d+"`. */
+        idPattern: string;
+        /** NOTE(review): presumably RegExp flags applied to `idPattern` — confirm against the verifier implementation. */
+        idFlags?: string;
+        /** Stage the ids are extracted from. */
+        from: WorkflowStageName;
+        /** Stages that must each contain every extracted id. */
+        to: WorkflowStageName[];
+    }>;
+    /**
+     * Stages that must not contain any of the listed case-insensitive
+     * phrases. Defaults to `["TBD", "TODO", "placeholder"]` when set to an
+     * empty array; omit entirely to skip the check.
+     */
+    placeholderFree?: {
+        /** Stages to scan for the phrases. */
+        stages: WorkflowStageName[];
+        /**
+         * Phrases to reject.
+         * NOTE(review): the default list described above presumably applies
+         * when this is omitted or empty — confirm against the verifier.
+         */
+        phrases?: string[];
+    };
+    /**
+     * Free-form substring pairs: for every entry, if `must` appears in the
+     * named stage, `forbid` must NOT appear anywhere in the listed
+     * `stages`. Useful for "v1 decided in scope, plan must not say v2".
+     */
+    noContradictions?: Array<{
+        /** Declaring stage in which `must` is looked for. */
+        stage: WorkflowStageName;
+        /** Substring whose presence in `stage` activates the rule. */
+        must: string;
+        /** Substring that must then be absent from every stage in `stages`. */
+        forbid: string;
+        /** Stages checked for `forbid`. */
+        stages: WorkflowStageName[];
+    }>;
+}
+/**
+ * A single stage step inside a Tier C workflow case. The stage's
+ * `inputPrompt` is handed to the Tier B with-tools agent with prior-stage
+ * artifacts seeded into the sandbox under `stages/<name>.md`.
+ */
+export interface WorkflowStageStep {
+    /** Stage this step runs; the runner also looks up the judge rubric under this name. */
+    name: WorkflowStageName;
+    /** Prompt handed to the agent for this stage. */
+    inputPrompt: string;
+    /** Per-stage rubric id override (defaults to the stage name). */
+    rubric?: string;
+    /** Per-stage required rubric check ids (mirror of JudgeExpected.requiredChecks). */
+    requiredChecks?: string[];
+    /** Per-stage minimum rubric scores (mirror of JudgeExpected.minimumScores). */
+    minimumScores?: Record<string, number>;
+}
+/**
+ * Supported workflow stages. Deliberately a subset of `FlowStage` —
+ * Tier C covers the early "design" arc of a project. TDD/review/ship
+ * are out of scope (they require real code execution).
+ */
+export declare const WORKFLOW_STAGES: readonly ["brainstorm", "scope", "design", "spec", "plan"];
+/** Union of the literal stage names listed in `WORKFLOW_STAGES`. */
+export type WorkflowStageName = (typeof WORKFLOW_STAGES)[number];
+/**
+ * A Tier C workflow case. Lives under
+ * `.cclaw/evals/corpus/workflows/<id>.yaml` and wires a multi-stage run
+ * through the with-tools agent.
+ */
+export interface WorkflowCase {
+    /** Case identifier; the runner reports it as `caseId` on the case result. */
+    id: string;
+    /** Short human-readable description (rendered in reports). */
+    description?: string;
+    /** Project files seeded into the sandbox before stage 1 runs. */
+    contextFiles?: string[];
+    /** Ordered list of stages to run. Must be non-empty. */
+    stages: WorkflowStageStep[];
+    /** Cross-stage consistency checks (Tier C-specific verifier family). */
+    consistency?: WorkflowConsistencyExpected;
+}
+/** Per-stage record inside a Tier C workflow run. */
+export interface WorkflowStageResult {
+    /** Stage this record belongs to. */
+    stage: WorkflowStageName;
+    /** Artifact text produced for the stage; this is what the runner passes to the judge. */
+    artifact: string;
+    /** Stage duration in milliseconds. */
+    durationMs: number;
+    /** Spend attributed to the stage, in USD. */
+    usageUsd: number;
+    /** Tool-use summary; the runner reports its `turns` and `calls` in the `workflow:agent` details. */
+    toolUse: ToolUseSummary;
+    attempts: number;
+    model: string;
+    promptTokens: number;
+    completionTokens: number;
+    /**
+     * True when the judge (when requested) produced `ok:true` for every required check.
+     * The runner sets it to false when the stage rubric is missing or the
+     * judge invocation throws, and leaves it unset when the judge is not
+     * requested.
+     */
+    judgeOk?: boolean;
+    /** Per-rubric-check medians keyed by check id (for the report). */
+    judgeMedians?: Record<string, number>;
+}
+/** Tier C orchestration output collected by the runner. */
+export interface WorkflowRunSummary {
+    /** Id of the workflow case this summary belongs to. */
+    caseId: string;
+    /** Per-stage records, including any judge fields the runner attached. */
+    stages: WorkflowStageResult[];
+    /**
+     * Spend reported by the workflow run, in USD. The runner copies it as-is
+     * and does not add judge spend; that is only part of the case-level
+     * `costUsd`.
+     */
+    totalUsageUsd: number;
+    /** Total duration reported by the workflow run, in milliseconds. */
+    totalDurationMs: number;
+    /** True when every stage judge was ok (or judge was skipped everywhere). */
+    allJudgeOk: boolean;
+}

package/dist/eval/types.js CHANGED Viewed

@@ -11,5 +11,25 @@ export const EVAL_TIERS = ["A", "B", "C"];
 /**
  * Verifier kinds, in increasing cost and decreasing determinism:
  * structural and rules run without LLM; judge and workflow use the configured model.
+ * `consistency` is the Tier C cross-artifact family (deterministic but
+ * operates over multiple artifacts at once).
  */
-export const VERIFIER_KINDS = ["structural", "rules", "judge", "workflow"];
+export const VERIFIER_KINDS = [
+    "structural",
+    "rules",
+    "judge",
+    "workflow",
+    "consistency"
+];
+/**
+ * Supported workflow stages. Deliberately a subset of `FlowStage` —
+ * Tier C covers the early "design" arc of a project. TDD/review/ship
+ * are out of scope (they require real code execution).
+ *
+ * The `WorkflowStageName` type in the declaration file is the union of
+ * these literals.
+ */
+export const WORKFLOW_STAGES = [
+    "brainstorm",
+    "scope",
+    "design",
+    "spec",
+    "plan"
+];

package/dist/eval/verifiers/workflow-consistency.d.ts ADDED Viewed

@@ -0,0 +1,21 @@
+/**
+ * Cross-artifact consistency verifier for Tier C.
+ *
+ * Operates over a `{ stage → artifact }` map produced by the workflow
+ * agent and emits deterministic verifier results for:
+ *
+ *  - `ids_flow`: every id extracted from `from` must appear in every
+ *    `to` stage. Typical use — `D-\d+` from scope must all land in plan.
+ *  - `placeholder_free`: none of the listed phrases
+ *    (default `TBD`/`TODO`/`placeholder`) appear in any of the named
+ *    stages.
+ *  - `no_contradictions`: for each entry, if `must` is present in the
+ *    declaring stage, `forbid` must not appear in any of the listed
+ *    `stages`.
+ *
+ * Each sub-check contributes zero or more `VerifierResult`s with
+ * `kind: "consistency"`. An empty `WorkflowConsistencyExpected` produces
+ * zero results so authors can opt in incrementally.
+ */
+import type { VerifierResult, WorkflowConsistencyExpected, WorkflowStageName } from "../types.js";
+/**
+ * Checks the stage artifacts of one workflow run against the case's
+ * consistency expectations.
+ *
+ * @param artifacts Artifact text keyed by stage name.
+ * @param expected Consistency block from the workflow case; may be
+ *   `undefined`. NOTE(review): presumably that yields no results, like an
+ *   empty block — confirm in the implementation.
+ * @returns Zero or more verifier results with `kind: "consistency"`.
+ */
+export declare function verifyWorkflowConsistency(artifacts: Map<WorkflowStageName, string>, expected: WorkflowConsistencyExpected | undefined): VerifierResult[];