npm - cclaw-cli - Versions diffs - 0.25.0 → 0.27.0 - Mend

cclaw-cli 0.25.0 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

package/dist/cli.d.ts +4 -0
package/dist/cli.js +79 -4
package/dist/eval/agents/with-tools.d.ts +44 -0
package/dist/eval/agents/with-tools.js +261 -0
package/dist/eval/agents/workflow.d.ts +24 -0
package/dist/eval/agents/workflow.js +133 -0
package/dist/eval/config-loader.js +38 -2
package/dist/eval/diff.d.ts +64 -0
package/dist/eval/diff.js +323 -0
package/dist/eval/llm-client.d.ts +10 -0
package/dist/eval/llm-client.js +10 -1
package/dist/eval/report.js +54 -0
package/dist/eval/runner.d.ts +10 -1
package/dist/eval/runner.js +285 -20
package/dist/eval/sandbox.d.ts +38 -0
package/dist/eval/sandbox.js +137 -0
package/dist/eval/tools/glob.d.ts +2 -0
package/dist/eval/tools/glob.js +163 -0
package/dist/eval/tools/grep.d.ts +2 -0
package/dist/eval/tools/grep.js +152 -0
package/dist/eval/tools/index.d.ts +7 -0
package/dist/eval/tools/index.js +35 -0
package/dist/eval/tools/read.d.ts +2 -0
package/dist/eval/tools/read.js +122 -0
package/dist/eval/tools/types.d.ts +49 -0
package/dist/eval/tools/types.js +41 -0
package/dist/eval/tools/write.d.ts +2 -0
package/dist/eval/tools/write.js +92 -0
package/dist/eval/types.d.ts +152 -1
package/dist/eval/types.js +21 -1
package/dist/eval/verifiers/workflow-consistency.d.ts +21 -0
package/dist/eval/verifiers/workflow-consistency.js +225 -0
package/dist/eval/workflow-corpus.d.ts +7 -0
package/dist/eval/workflow-corpus.js +207 -0
package/package.json +1 -1

package/dist/eval/runner.js CHANGED Viewed

@@ -2,8 +2,11 @@ import { randomUUID } from "node:crypto";
 import { CCLAW_VERSION } from "../constants.js";
 import { FLOW_STAGES } from "../types.js";
 import { runSingleShot } from "./agents/single-shot.js";
+import { MaxTurnsExceededError, runWithTools } from "./agents/with-tools.js";
+import { runWorkflow } from "./agents/workflow.js";
 import { compareAgainstBaselines, loadBaselinesByStage } from "./baseline.js";
 import { loadCorpus, readExtraFixtures, readFixtureArtifact } from "./corpus.js";
+import { loadWorkflowCorpus } from "./workflow-corpus.js";
 import { loadEvalConfig } from "./config-loader.js";
 import { createCostGuard, DailyCostCapExceededError } from "./cost-guard.js";
 import { createEvalClient, EvalLlmError } from "./llm-client.js";
@@ -12,6 +15,7 @@ import { judgeResultsToVerifiers, runJudge } from "./verifiers/judge.js";
 import { verifyRules } from "./verifiers/rules.js";
 import { verifyStructural } from "./verifiers/structural.js";
 import { verifyTraceability } from "./verifiers/traceability.js";
+import { verifyWorkflowConsistency } from "./verifiers/workflow-consistency.js";
 function groupByStage(cases) {
     return cases.reduce((acc, item) => {
         acc[item.stage] = (acc[item.stage] ?? 0) + 1;
@@ -39,8 +43,15 @@ function resolveRunFlags(options) {
     const rulesRequested = options.rules === true;
     const schemaOnly = options.schemaOnly === true;
     const judgeRequested = options.judge === true;
+    const tier = options.tier ?? "A";
     const runJudge = judgeRequested && !schemaOnly;
-    const runAgent = runJudge && (options.tier ?? "A") === "A";
+    // Tier C always needs the agent loop (no fixture fallback for workflows),
+    // so we still require an LLM client but we do NOT require --judge on the
+    // CLI to produce a workflow run. The judge piece itself stays gated by
+    // `runJudge` so consistency-only runs are cheap and deterministic.
+    const runAgent = tier === "C"
+        ? !schemaOnly
+        : runJudge && (tier === "A" || tier === "B");
     return {
         runStructural: true,
         runRules: rulesRequested && !schemaOnly,
@@ -81,6 +92,184 @@ async function loadArtifactOrRecord(projectRoot, caseEntry, verifierResults) {
         return undefined;
     }
 }
+function stageJudgeHint(step) { // Build the per-stage judge hint, copying only the override fields that are truthy on the workflow step.
+    const hint = {};
+    if (step.rubric)
+        hint.rubric = step.rubric;
+    if (step.requiredChecks)
+        hint.requiredChecks = step.requiredChecks;
+    if (step.minimumScores)
+        hint.minimumScores = step.minimumScores;
+    return hint; // Stays an empty object when the step sets none of the three fields.
+}
+async function runWorkflowCase(ctx) { // Run one workflow case: execute the workflow, optionally judge each stage, then run the consistency verifier and build the case result.
+    const { projectRoot, workflow, plannedTier, flags, config, client, rubrics } = ctx; // NOTE(review): runEval also passes ctx.costGuard, which is not read here.
+    const started = Date.now();
+    const verifierResults = [];
+    let caseCostUsd = 0;
+    const lastStage = workflow.stages[workflow.stages.length - 1]?.name ??
+        "plan"; // The case is reported under its final stage's name, or "plan" when that name is missing.
+    if (!flags.runAgent || !client) { // Agent disabled or no LLM client: return a failing result without running anything.
+        verifierResults.push({
+            kind: "workflow",
+            id: "workflow:agent:disabled",
+            ok: false,
+            score: 0,
+            message: "Tier C requires the with-tools agent (CCLAW_EVAL_API_KEY or injected client). " +
+                "Re-run with credentials to execute the workflow.",
+            details: { stages: workflow.stages.map((s) => s.name) }
+        });
+        return {
+            caseId: workflow.id,
+            stage: lastStage,
+            tier: plannedTier,
+            passed: false,
+            durationMs: Date.now() - started,
+            verifierResults
+        };
+    }
+    let workflowResult;
+    try {
+        workflowResult = await runWorkflow({
+            workflow,
+            config,
+            projectRoot,
+            client
+        });
+    }
+    catch (err) {
+        if (err instanceof DailyCostCapExceededError)
+            throw err; // Cost-cap errors propagate to the caller; any other error becomes a failing verifier result below.
+        const retryable = err instanceof EvalLlmError ? err.retryable : false;
+        const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
+        verifierResults.push({
+            kind: "workflow",
+            id: "workflow:agent:error",
+            ok: false,
+            score: 0,
+            message: err instanceof Error ? err.message : String(err),
+            details: {
+                retryable,
+                ...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {}) // Key is only present for MaxTurnsExceededError.
+            }
+        });
+        return {
+            caseId: workflow.id,
+            stage: lastStage,
+            tier: plannedTier,
+            passed: false,
+            durationMs: Date.now() - started,
+            verifierResults
+        };
+    }
+    caseCostUsd += workflowResult.totalUsageUsd; // Agent spend; judge spend is added per stage further down.
+    const stageResults = [...workflowResult.stages]; // Shallow copy: the array is new, but the stage objects annotated below are the same objects held by workflowResult.
+    verifierResults.push({
+        kind: "workflow",
+        id: "workflow:agent",
+        ok: true,
+        score: 1,
+        message: `workflow ran ${stageResults.length} stage(s) in ` +
+            `${workflowResult.totalDurationMs}ms ` +
+            `(spent $${workflowResult.totalUsageUsd.toFixed(6)})`,
+        details: {
+            stages: stageResults.map((s) => ({
+                name: s.stage,
+                durationMs: s.durationMs,
+                usageUsd: s.usageUsd,
+                turns: s.toolUse.turns,
+                calls: s.toolUse.calls
+            }))
+        }
+    });
+    let allJudgeOk = true;
+    if (flags.runJudge) { // Judge each stage's artifact against the rubric registered under the stage name.
+        for (let i = 0; i < workflow.stages.length; i += 1) {
+            const step = workflow.stages[i];
+            const stageResult = stageResults[i]; // NOTE(review): assumes runWorkflow returns one result per workflow stage, in the same order; confirm, since a missing entry would make the writes below throw.
+            const rubric = rubrics.get(step.name);
+            if (!rubric) { // A missing rubric fails the stage but does not stop the remaining stages from being judged.
+                verifierResults.push({
+                    kind: "judge",
+                    id: `judge:rubric:missing:${step.name}`,
+                    ok: false,
+                    score: 0,
+                    message: `No rubric at .cclaw/evals/rubrics/${step.name}.yaml.`,
+                    details: { stage: step.name }
+                });
+                allJudgeOk = false;
+                stageResult.judgeOk = false;
+                continue;
+            }
+            const hint = stageJudgeHint(step);
+            try {
+                const invocation = await runJudge({
+                    artifact: stageResult.artifact,
+                    rubric,
+                    config,
+                    client,
+                    caseHint: hint
+                });
+                caseCostUsd += invocation.usageUsd;
+                const judgeVerifiers = judgeResultsToVerifiers(rubric, invocation, config, hint);
+                const medians = {}; // Map of checkId to median score, stored on the stage result.
+                for (const agg of invocation.aggregates) {
+                    medians[agg.checkId] = agg.median;
+                }
+                stageResult.judgeMedians = medians;
+                const stageOk = judgeVerifiers.every((v) => v.ok);
+                stageResult.judgeOk = stageOk;
+                if (!stageOk)
+                    allJudgeOk = false;
+                for (const v of judgeVerifiers) {
+                    verifierResults.push({
+                        ...v,
+                        id: `${v.id}:${step.name}`, // Suffix the stage name so ids from different stages stay distinct.
+                        details: { ...(v.details ?? {}), stage: step.name }
+                    });
+                }
+            }
+            catch (err) {
+                if (err instanceof DailyCostCapExceededError)
+                    throw err; // As above, only the cost cap aborts the run; other judge errors fail this stage alone.
+                const retryable = err instanceof EvalLlmError ? err.retryable : false;
+                verifierResults.push({
+                    kind: "judge",
+                    id: `judge:invocation:error:${step.name}`,
+                    ok: false,
+                    score: 0,
+                    message: err instanceof Error ? err.message : String(err),
+                    details: { retryable, rubricId: rubric.id, stage: step.name }
+                });
+                stageResult.judgeOk = false;
+                allJudgeOk = false;
+            }
+        }
+    }
+    const consistencyResults = verifyWorkflowConsistency(workflowResult.artifacts, workflow.consistency); // Runs whether or not judging was requested.
+    verifierResults.push(...consistencyResults);
+    const nonSkipped = verifierResults.filter((r) => r.details?.skipped !== true); // Results flagged details.skipped === true are left out of the pass/fail decision...
+    const allOk = nonSkipped.length === 0
+        ? verifierResults.every((r) => r.ok) // ...unless every result is skipped, in which case all of them are checked.
+        : nonSkipped.every((r) => r.ok);
+    const workflowSummary = {
+        caseId: workflow.id,
+        stages: stageResults,
+        totalUsageUsd: workflowResult.totalUsageUsd,
+        totalDurationMs: workflowResult.totalDurationMs,
+        allJudgeOk: flags.runJudge ? allJudgeOk : true // Reported as true when judging was not requested.
+    };
+    return {
+        caseId: workflow.id,
+        stage: lastStage,
+        tier: plannedTier,
+        passed: allOk,
+        durationMs: Date.now() - started,
+        costUsd: caseCostUsd > 0 ? Number(caseCostUsd.toFixed(6)) : undefined, // Undefined when nothing was spent; otherwise rounded to 6 decimal places.
+        verifierResults,
+        workflow: workflowSummary
+    };
+}
 async function runCase(ctx) {
     const { projectRoot, caseEntry, plannedTier, flags, config, client, costGuard, rubrics } = ctx;
     const started = Date.now();
@@ -94,7 +283,7 @@ async function runCase(ctx) {
     const needsArtifact = hasStructural || hasRules || hasTraceability || judgeRequested;
     let artifact;
     if (needsArtifact) {
-        if (flags.runAgent && judgeRequested && client) {
+        if (flags.runAgent && judgeRequested && client && plannedTier === "A") {
             try {
                 const produced = await runSingleShot({
                     caseEntry,
@@ -133,6 +322,52 @@ async function runCase(ctx) {
                 });
             }
         }
+        else if (flags.runAgent && judgeRequested && client && plannedTier === "B") {
+            try {
+                const produced = await runWithTools({
+                    caseEntry,
+                    config,
+                    projectRoot,
+                    client
+                });
+                artifact = produced.artifact;
+                caseCostUsd += produced.usageUsd;
+                verifierResults.push({
+                    kind: "workflow",
+                    id: "agent:with-tools",
+                    ok: true,
+                    score: 1,
+                    message: `with-tools agent produced ${produced.artifact.length} char(s) in ` +
+                        `${produced.durationMs}ms across ${produced.toolUse.turns} turn(s) ` +
+                        `(${produced.toolUse.calls} tool call(s))`,
+                    details: {
+                        model: produced.model,
+                        tokensIn: produced.usage.promptTokens,
+                        tokensOut: produced.usage.completionTokens,
+                        usageUsd: produced.usageUsd,
+                        attempts: produced.attempts,
+                        toolUse: produced.toolUse
+                    }
+                });
+            }
+            catch (err) {
+                if (err instanceof DailyCostCapExceededError)
+                    throw err;
+                const retryable = err instanceof EvalLlmError ? err.retryable : false;
+                const maxTurns = err instanceof MaxTurnsExceededError ? err.turns : undefined;
+                verifierResults.push({
+                    kind: "workflow",
+                    id: "agent:with-tools",
+                    ok: false,
+                    score: 0,
+                    message: err instanceof Error ? err.message : String(err),
+                    details: {
+                        retryable,
+                        ...(maxTurns !== undefined ? { maxTurnsExceeded: maxTurns } : {})
+                    }
+                });
+            }
+        }
         else {
             artifact = await loadArtifactOrRecord(projectRoot, caseEntry, verifierResults);
         }
@@ -279,18 +514,22 @@ function stagesInResults(caseResults) {
  */
 export async function runEval(options) {
     const config = await loadEvalConfig(options.projectRoot, options.env ?? process.env);
-    const corpus = await loadCorpus(options.projectRoot, options.stage);
     const plannedTier = options.tier ?? config.defaultTier;
+    const corpus = plannedTier === "C" ? [] : await loadCorpus(options.projectRoot, options.stage);
+    const workflowCorpus = plannedTier === "C" ? await loadWorkflowCorpus(options.projectRoot) : [];
     const notes = [];
-    if (corpus.length === 0) {
+    if (plannedTier !== "C" && corpus.length === 0) {
         notes.push("Corpus is empty. Seed cases live under `.cclaw/evals/corpus/<stage>/*.yaml`.");
     }
+    if (plannedTier === "C" && workflowCorpus.length === 0) {
+        notes.push("Workflow corpus is empty. Tier C cases live under `.cclaw/evals/corpus/workflows/*.yaml`.");
+    }
     const flags = resolveRunFlags(options);
     if (flags.runJudge && !config.apiKey && !options.llmClient) {
         notes.push("--judge requires CCLAW_EVAL_API_KEY (or an injected client for tests); judge pipeline will report errors per case.");
     }
-    if ((options.tier ?? "A") !== "A" && flags.runJudge) {
-        notes.push("Tier B/C agent-under-test is not wired yet; --judge will score the committed fixture as a stand-in.");
+    if (plannedTier === "C" && !config.apiKey && !options.llmClient) {
+        notes.push("Tier C requires CCLAW_EVAL_API_KEY (or an injected client for tests); workflow runs will fail per case without one.");
     }
     if (options.dryRun === true) {
         const summary = {
@@ -301,12 +540,20 @@ export async function runEval(options) {
                 byStage: groupByStage(corpus),
                 cases: corpus.map((item) => ({ id: item.id, stage: item.stage }))
             },
+            workflowCorpus: {
+                total: workflowCorpus.length,
+                cases: workflowCorpus.map((item) => ({
+                    id: item.id,
+                    stages: item.stages.map((s) => s.name)
+                }))
+            },
             plannedTier,
             verifiersAvailable: {
                 structural: flags.runStructural,
                 rules: flags.runRules,
                 judge: flags.runJudge,
-                workflow: flags.runAgent
+                workflow: flags.runAgent,
+                consistency: plannedTier === "C"
             },
             notes
         };
@@ -314,26 +561,44 @@ export async function runEval(options) {
     }
     const costGuard = createCostGuard(options.projectRoot, config);
     let wrappedClient;
-    if (flags.runJudge) {
+    const clientNeeded = flags.runJudge || plannedTier === "C";
+    if (clientNeeded) {
         const base = options.llmClient ?? createEvalClient(config);
         wrappedClient = wrapClientWithCostGuard(base, costGuard, config.judgeModel ?? config.model);
     }
-    const rubrics = flags.runJudge
+    const rubricsNeeded = flags.runJudge;
+    const rubrics = rubricsNeeded
         ? await loadAllRubrics(options.projectRoot)
         : new Map();
     const now = new Date().toISOString();
     const caseResults = [];
-    for (const item of corpus) {
-        caseResults.push(await runCase({
-            projectRoot: options.projectRoot,
-            caseEntry: item,
-            plannedTier,
-            flags,
-            config,
-            client: wrappedClient,
-            costGuard,
-            rubrics
-        }));
+    if (plannedTier === "C") {
+        for (const wf of workflowCorpus) {
+            caseResults.push(await runWorkflowCase({
+                projectRoot: options.projectRoot,
+                workflow: wf,
+                plannedTier,
+                flags,
+                config,
+                client: wrappedClient,
+                costGuard,
+                rubrics
+            }));
+        }
+    }
+    else {
+        for (const item of corpus) {
+            caseResults.push(await runCase({
+                projectRoot: options.projectRoot,
+                caseEntry: item,
+                plannedTier,
+                flags,
+                config,
+                client: wrappedClient,
+                costGuard,
+                rubrics
+            }));
+        }
     }
     const stages = stagesInResults(caseResults);
     const baselines = await loadBaselinesByStage(options.projectRoot, stages);

package/dist/eval/sandbox.d.ts ADDED Viewed

@@ -0,0 +1,38 @@
+export declare class SandboxEscapeError extends Error { // Error type carrying the path that was refused.
+    readonly requestedPath: string; // The path value given to the constructor.
+    constructor(requestedPath: string, reason: string);
+}
+export interface SandboxOptions {
+    /** Project root that `contextFiles` are resolved against. */
+    projectRoot: string;
+    /** Case-relative paths to copy into the sandbox before the agent starts. */
+    contextFiles?: string[];
+    /**
+     * Base directory that will host the per-case tmpdir. Defaults to
+     * `os.tmpdir()`. Tests inject a repo-local path so CI leaves no
+     * traces in `/tmp` when assertions fail.
+     */
+    baseDir?: string;
+    /** Override the per-case suffix. Primarily for deterministic tests. */
+    idOverride?: string;
+}
+export interface Sandbox {
+    /** Absolute path to the sandbox root directory. */
+    root: string;
+    /**
+     * Resolve `requested` relative to the sandbox root and return the
+     * absolute, realpath'd filesystem path. Throws
+     * `SandboxEscapeError` when the resolution crosses the boundary.
+     *
+     * `allowMissing: true` lets callers pre-resolve a destination for a
+     * write where the final component doesn't exist yet — the parent
+     * directory is realpath'd to still catch symlink escapes.
+     */
+    resolve(requested: string, options?: {
+        allowMissing?: boolean;
+    }): Promise<string>;
+    /** Remove the sandbox directory. Idempotent. */
+    dispose(): Promise<void>;
+}
+/** Create and prep a fresh sandbox. Callers own cleanup via `dispose()`. */
+export declare function createSandbox(options: SandboxOptions): Promise<Sandbox>;

package/dist/eval/sandbox.js ADDED Viewed

@@ -0,0 +1,137 @@
+/**
+ * Per-case sandbox for the Tier B with-tools agent.
+ *
+ * Every case gets its own `os.tmpdir()/cclaw-eval-<uuid>/` directory. Any
+ * `contextFiles` the case declares are copied in relative to the project
+ * root, and every tool invocation resolves paths against the sandbox
+ * root with a defensive check that refuses symlinks and `..` escapes.
+ *
+ * Design notes:
+ *
+ * - The sandbox is intentionally tiny (one directory, no symlink
+ *   creation, no executable bits). We rely on `fs.realpath` on every
+ *   resolved path so hostile tool output that creates a symlink to
+ *   `/etc/passwd` and then tries to read it still trips the boundary
+ *   check.
+ * - Cleanup is handled by `dispose()`; callers (runner, tests) must
+ *   invoke it in a `try/finally` so leftover temp directories never
+ *   accumulate.
+ * - The sandbox does not preserve the project's directory structure
+ *   verbatim. Each entry in `contextFiles` is copied flat into
+ *   `sandboxRoot/<basename>` unless it contains path separators, in
+ *   which case the full relative layout is recreated. That keeps demo
+ *   cases portable while still letting richer cases place files under
+ *   subdirectories (e.g. `.cclaw/skills/brainstorming/SKILL.md`).
+ */
+import { randomUUID } from "node:crypto";
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+export class SandboxEscapeError extends Error { // Raised by the sandbox path resolver whenever it rejects a requested path.
+    requestedPath; // The rejected path, as passed to the constructor.
+    constructor(requestedPath, reason) {
+        super(`Sandbox refused path "${requestedPath}": ${reason}.`); // `reason` is embedded in the message only; it is not stored as a property.
+        this.name = "SandboxEscapeError";
+        this.requestedPath = requestedPath;
+    }
+}
+/** Create and prep a fresh sandbox. Callers own cleanup via `dispose()`. */
+export async function createSandbox(options) { // Creates `<baseDir>/cclaw-eval-<id>`, copies any contextFiles into it, and returns { root, resolve, dispose }.
+    const baseDir = options.baseDir ?? os.tmpdir();
+    const id = options.idOverride ?? randomUUID();
+    const root = path.join(baseDir, `cclaw-eval-${id}`);
+    await fs.mkdir(root, { recursive: true }); // `recursive: true` means an already-existing directory (e.g. a repeated idOverride) is not an error.
+    const realRoot = await fs.realpath(root); // Canonical root; every containment check below compares against this value.
+    if (options.contextFiles && options.contextFiles.length > 0) {
+        for (const rel of options.contextFiles) {
+            await copyContextFile(options.projectRoot, realRoot, rel); // NOTE(review): if a copy throws, the directory created above is not removed and no handle is returned for dispose(); confirm whether callers clean up.
+        }
+    }
+    async function resolveInside(requested, opts = {}) { // Maps a sandbox-relative path to an absolute real path, or throws SandboxEscapeError.
+        if (typeof requested !== "string" || requested.length === 0) {
+            throw new SandboxEscapeError(String(requested), "path must be a non-empty string");
+        }
+        if (path.isAbsolute(requested)) {
+            throw new SandboxEscapeError(requested, "absolute paths are not allowed");
+        }
+        if (requested.includes("\0")) {
+            throw new SandboxEscapeError(requested, "NUL byte in path");
+        }
+        const joined = path.resolve(realRoot, requested); // Lexical resolution only; symlinks are handled by the realpath step below.
+        const relative = path.relative(realRoot, joined);
+        if (relative.startsWith("..") || path.isAbsolute(relative)) { // NOTE(review): startsWith("..") also matches in-sandbox names such as "..cache"; confirm that rejecting those is intended.
+            throw new SandboxEscapeError(requested, "resolves outside the sandbox");
+        }
+        let finalPath;
+        try {
+            finalPath = await fs.realpath(joined);
+        }
+        catch (err) {
+            if (!opts.allowMissing) { // Without allowMissing, any realpath failure is reported as a SandboxEscapeError.
+                throw new SandboxEscapeError(requested, `realpath failed: ${err.message}`);
+            }
+            const existingAncestor = await findExistingAncestor(joined, realRoot);
+            if (!existingAncestor) {
+                throw new SandboxEscapeError(requested, "no existing ancestor inside the sandbox");
+            }
+            const ancestorRel = path.relative(realRoot, existingAncestor.real);
+            if (ancestorRel.startsWith("..") || path.isAbsolute(ancestorRel)) { // The nearest existing ancestor must itself resolve inside the sandbox.
+                throw new SandboxEscapeError(requested, "parent resolves outside the sandbox");
+            }
+            finalPath = path.join(existingAncestor.real, existingAncestor.trailing); // Canonical ancestor plus the tail that realpath could not resolve.
+        }
+        const finalRel = path.relative(realRoot, finalPath);
+        if (finalRel.startsWith("..") || path.isAbsolute(finalRel)) { // Final check on the resolved path, applied to both the existing and the allowMissing case.
+            throw new SandboxEscapeError(requested, "realpath escapes the sandbox");
+        }
+        return finalPath;
+    }
+    return {
+        root: realRoot,
+        resolve: resolveInside,
+        async dispose() {
+            await fs.rm(realRoot, { recursive: true, force: true }); // `force: true` ignores a missing directory, so repeated calls do not throw for that reason.
+        }
+    };
+}
+async function findExistingAncestor(target, stopAt) { // Walks up from `target` to the nearest path realpath can resolve; returns { real, trailing } or undefined.
+    const segments = []; // Unresolvable path components, collected deepest-first.
+    let current = target;
+    while (true) {
+        try {
+            const real = await fs.realpath(current);
+            return { real, trailing: path.join(...segments.reverse()) }; // `trailing` is the unresolved tail in original order; path.join() with no arguments yields ".".
+        }
+        catch {
+            const parent = path.dirname(current);
+            if (parent === current)
+                return undefined; // dirname no longer changes the path, so there is nothing further up to try.
+            segments.push(path.basename(current));
+            if (path.relative(stopAt, parent).startsWith("..")) // Give up once the walk would leave `stopAt`. NOTE(review): same startsWith("..") caveat as in the resolver (names beginning with ".." also match).
+                return undefined;
+            current = parent;
+        }
+    }
+}
+async function copyContextFile(projectRoot, sandboxRoot, relPath) { // Copies one project-relative file or directory into the sandbox at the same relative path.
+    if (path.isAbsolute(relPath)) {
+        throw new Error(`context_files must be project-relative: ${relPath}`);
+    }
+    const src = path.resolve(projectRoot, relPath);
+    const srcReal = await fs.realpath(src); // Rejects if the entry cannot be resolved; that error is not wrapped here.
+    const projectReal = await fs.realpath(projectRoot);
+    const inside = path.relative(projectReal, srcReal); // Compares real paths, so an entry that resolves outside the project (via ".." or a symlink) is refused.
+    if (inside.startsWith("..") || path.isAbsolute(inside)) {
+        throw new Error(`context_files entry resolves outside the project: ${relPath}`);
+    }
+    const stat = await fs.stat(srcReal);
+    if (stat.isDirectory()) {
+        const dest = path.join(sandboxRoot, relPath); // Destination uses the declared relPath, not the real path.
+        await fs.mkdir(dest, { recursive: true });
+        await fs.cp(srcReal, dest, { recursive: true });
+        return;
+    }
+    const dest = path.join(sandboxRoot, relPath);
+    await fs.mkdir(path.dirname(dest), { recursive: true }); // Create parent directories so nested relative paths can be copied.
+    await fs.copyFile(srcReal, dest);
+}

package/dist/eval/tools/glob.d.ts ADDED Viewed

@@ -0,0 +1,2 @@
+import { type SandboxTool } from "./types.js";
+export declare const globTool: SandboxTool;