npm - cclaw-cli - Versions diffs - 0.21.1 → 0.22.0 - Mend

cclaw-cli 0.21.1 → 0.22.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (32)

package/dist/cli.d.ts +9 -1
package/dist/cli.js +123 -1
package/dist/constants.d.ts +11 -2
package/dist/constants.js +26 -1
package/dist/content/eval-scaffold.d.ts +11 -0
package/dist/content/eval-scaffold.js +89 -0
package/dist/content/skills.js +1 -1
package/dist/content/stages/brainstorm.js +3 -7
package/dist/content/stages/design.js +2 -5
package/dist/content/stages/plan.js +2 -4
package/dist/content/stages/review.js +2 -4
package/dist/content/stages/schema-types.d.ts +8 -2
package/dist/content/stages/scope.js +2 -6
package/dist/content/stages/ship.js +2 -4
package/dist/content/stages/spec.js +2 -5
package/dist/content/stages/tdd.js +2 -4
package/dist/eval/config-loader.d.ts +14 -0
package/dist/eval/config-loader.js +237 -0
package/dist/eval/corpus.d.ts +8 -0
package/dist/eval/corpus.js +91 -0
package/dist/eval/llm-client.d.ts +62 -0
package/dist/eval/llm-client.js +19 -0
package/dist/eval/report.d.ts +11 -0
package/dist/eval/report.js +88 -0
package/dist/eval/runner.d.ts +53 -0
package/dist/eval/runner.js +96 -0
package/dist/eval/types.d.ts +136 -0
package/dist/eval/types.js +15 -0
package/dist/install.js +22 -0
package/dist/runs.d.ts +0 -18
package/dist/runs.js +1 -188
package/package.json +1 -1

package/dist/eval/types.d.ts ADDED Viewed

@@ -0,0 +1,136 @@
+/**
+ * Core types for the cclaw eval subsystem (Phase 7).
+ *
+ * The eval subsystem lets us measure whether a change to a prompt, skill, or
+ * stage contract improves or regresses the quality of agent output. It is
+ * deliberately decoupled from the main cclaw runtime so that:
+ *
+ * - Users who never run `cclaw eval` pay zero runtime cost.
+ * - The verifier / rubric / LLM stack evolves on its own release cadence (Waves 7.0-7.6).
+ * - Any OpenAI-compatible endpoint can be swapped in via config (z.ai, OpenAI, vLLM, etc.).
+ */
+import type { FlowStage } from "../types.js";
+/**
+ * Fidelity tier for the agent-under-test.
+ *
+ * - `A` — single-shot API call, no tools. Cheap, validates core prompt behavior.
+ * - `B` — SDK loop with function-calling for Read/Write/Glob/Grep inside a sandbox.
+ * - `C` — multi-stage workflow run (brainstorm -> scope -> ... -> plan) with threaded
+ *   artifacts. Most realistic tier we ship in Phase 7; literal IDE-harness runs
+ *   (claude-code / cursor-agent proxied to OpenAI-compat) are deferred to Phase 8.
+ */
+export declare const EVAL_TIERS: readonly ["A", "B", "C"];
+export type EvalTier = (typeof EVAL_TIERS)[number];
+/**
+ * Verifier kinds, in increasing cost and decreasing determinism:
+ * structural and rules run without LLM; judge and workflow use the configured model.
+ */
+export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow"];
+export type VerifierKind = (typeof VERIFIER_KINDS)[number];
+/**
+ * A single eval case describes one input scenario for one stage. Cases live in
+ * `.cclaw/evals/corpus/<stage>/<id>.yaml` and may reference a pre-generated
+ * fixture artifact for verifier development (Wave 7.1) before the agent loop
+ * exists (Wave 7.3+).
+ */
+export interface EvalCase {
+    id: string;
+    stage: FlowStage;
+    inputPrompt: string;
+    /** Project files copied into the Tier B/C sandbox before the agent runs. */
+    contextFiles?: string[];
+    /**
+     * Optional expected-shape hints consumed by structural/rule verifiers.
+     * Left intentionally loose; verifiers in Waves 7.1–7.2 will narrow this.
+     */
+    expected?: Record<string, unknown>;
+    /**
+     * Path (relative to the corpus case file) of a pre-generated artifact used
+     * when verifiers are exercised without a live agent loop. Primarily a Wave
+     * 7.1 development aid.
+     */
+    fixture?: string;
+}
+/** Result of one verifier applied to one case. */
+export interface VerifierResult {
+    kind: VerifierKind;
+    id: string;
+    ok: boolean;
+    /** Normalized 0..1 score when the verifier produces a numeric signal. */
+    score?: number;
+    message?: string;
+    details?: Record<string, unknown>;
+}
+/** Aggregate result for one case after all verifiers run. */
+export interface EvalCaseResult {
+    caseId: string;
+    stage: FlowStage;
+    tier: EvalTier;
+    passed: boolean;
+    durationMs: number;
+    costUsd?: number;
+    verifierResults: VerifierResult[];
+}
+/** Top-level eval report, serialized to JSON and rendered to Markdown. */
+export interface EvalReport {
+    schemaVersion: 1;
+    generatedAt: string;
+    runId: string;
+    cclawVersion: string;
+    provider: string;
+    model: string;
+    tier: EvalTier;
+    stages: FlowStage[];
+    cases: EvalCaseResult[];
+    summary: {
+        totalCases: number;
+        passed: number;
+        failed: number;
+        skipped: number;
+        totalCostUsd: number;
+        totalDurationMs: number;
+    };
+    /** Present when comparing against a saved baseline (Wave 7.1+). */
+    baselineDelta?: {
+        baselineId: string;
+        scoreDelta: number;
+        criticalFailures: number;
+    };
+}
+/**
+ * Eval configuration, persisted to `.cclaw/evals/config.yaml` and mergeable
+ * with `CCLAW_EVAL_*` environment variables at runtime.
+ */
+export interface EvalConfig {
+    /**
+     * Free-form provider name used in reports. The actual HTTP protocol is
+     * determined by `baseUrl`, which is expected to be OpenAI-compatible.
+     */
+    provider: string;
+    /** OpenAI-compatible base URL, e.g. `https://api.z.ai/api/coding/paas/v4`. */
+    baseUrl: string;
+    /** Model identifier for both agent-under-test and judge unless `judgeModel` overrides. */
+    model: string;
+    /** Optional separate model for the judge role. Defaults to `model`. */
+    judgeModel?: string;
+    /** Default tier when `--tier` is not supplied. */
+    defaultTier: EvalTier;
+    /** Optional hard stop on estimated USD spend per day. Unset = no cap. */
+    dailyUsdCap?: number;
+    /** Regression thresholds for CI gates. */
+    regression: {
+        /** Fail when overall score drops by more than this fraction (e.g. 0.15 = 15%). */
+        failIfDeltaBelow: number;
+        /** Fail when any single critical rubric drops below this absolute score. */
+        failIfCriticalBelow: number;
+    };
+    /** Per-agent-run timeout in milliseconds. */
+    timeoutMs: number;
+    /** Max retries per API call on transient failures. */
+    maxRetries: number;
+}
+/** Resolved config with env overrides applied. */
+export interface ResolvedEvalConfig extends EvalConfig {
+    apiKey?: string;
+    source: "default" | "file" | "env" | "file+env";
+}

package/dist/eval/types.js ADDED Viewed

@@ -0,0 +1,15 @@
+/**
+ * Fidelity tier for the agent-under-test.
+ *
+ * - `A` — single-shot API call, no tools. Cheap, validates core prompt behavior.
+ * - `B` — SDK loop with function-calling for Read/Write/Glob/Grep inside a sandbox.
+ * - `C` — multi-stage workflow run (brainstorm -> scope -> ... -> plan) with threaded
+ *   artifacts. Most realistic tier we ship in Phase 7; literal IDE-harness runs
+ *   (claude-code / cursor-agent proxied to OpenAI-compat) are deferred to Phase 8.
+ */
+export const EVAL_TIERS = ["A", "B", "C"];
+/**
+ * Verifier kinds, in increasing cost and decreasing determinism:
+ * structural and rules run without LLM; judge and workflow use the configured model.
+ */
+export const VERIFIER_KINDS = ["structural", "rules", "judge", "workflow"];

package/dist/install.js CHANGED Viewed

@@ -28,6 +28,7 @@ import { contextMonitorScript, promptGuardScript, workflowGuardScript } from "./
 import { META_SKILL_NAME, usingCclawSkillMarkdown } from "./content/meta-skill.js";
 import { decisionProtocolMarkdown, completionProtocolMarkdown, ethosProtocolMarkdown } from "./content/protocols.js";
 import { ARTIFACT_TEMPLATES, CURSOR_WORKFLOW_RULE_MDC, RULEBOOK_MARKDOWN, buildRulesJson } from "./content/templates.js";
+import { EVAL_BASELINES_README, EVAL_CONFIG_YAML, EVAL_CORPUS_README, EVAL_REPORTS_README, EVAL_RUBRICS_README } from "./content/eval-scaffold.js";
 import { TDD_WAVE_WALKTHROUGH_MARKDOWN, stageSkillFolder, stageSkillMarkdown } from "./content/skills.js";
 import { stageCommonGuidanceMarkdown } from "./content/stage-common-guidance.js";
 import { STAGE_EXAMPLES_REFERENCE_DIR, stageExamplesReferenceMarkdown } from "./content/examples.js";
@@ -184,6 +185,26 @@ async function writeArtifactTemplates(projectRoot) {
         await writeFileSafe(runtimePath(projectRoot, "templates", fileName), content);
     }
 }
+/**
+ * Seed the `.cclaw/evals/` scaffold. Only writes files that do not already
+ * exist so that user-authored config.yaml / corpus / rubrics / baselines are
+ * never clobbered by `cclaw sync`.
+ */
+async function writeEvalScaffold(projectRoot) {
+    const targets = [
+        { rel: "evals/config.yaml", content: EVAL_CONFIG_YAML },
+        { rel: "evals/corpus/README.md", content: EVAL_CORPUS_README },
+        { rel: "evals/rubrics/README.md", content: EVAL_RUBRICS_README },
+        { rel: "evals/baselines/README.md", content: EVAL_BASELINES_README },
+        { rel: "evals/reports/README.md", content: EVAL_REPORTS_README }
+    ];
+    for (const target of targets) {
+        const absolute = runtimePath(projectRoot, ...target.rel.split("/"));
+        if (await exists(absolute))
+            continue;
+        await writeFileSafe(absolute, target.content);
+    }
+}
 async function writeSkills(projectRoot, config) {
     for (const stage of COMMAND_FILE_ORDER) {
         const folder = stageSkillFolder(stage);
@@ -1044,6 +1065,7 @@ async function materializeRuntime(projectRoot, config, forceStateReset) {
     await writeSkills(projectRoot, config);
     await writeContextModes(projectRoot);
     await writeArtifactTemplates(projectRoot);
+    await writeEvalScaffold(projectRoot);
     await writeRulebook(projectRoot);
     await writeState(projectRoot, config, forceStateReset);
     await ensureRunSystem(projectRoot, { createIfMissing: false });

package/dist/runs.d.ts CHANGED Viewed

@@ -52,19 +52,6 @@ export interface ArchiveManifest {
     snapshottedStateFiles: string[];
     retro: ArchiveRunResult["retro"];
 }
-export interface RewindRunOptions {
-    to: FlowStage;
-    reason?: string;
-}
-export interface RewindRunResult {
-    rewindId: string;
-    from: FlowStage;
-    to: FlowStage;
-    invalidatedStages: FlowStage[];
-    staleArtifacts: string[];
-    archivePath: string;
-    nextState: FlowState;
-}
 interface EnsureRunSystemOptions {
     createIfMissing?: boolean;
 }
@@ -82,11 +69,6 @@ export declare function writeFlowState(projectRoot: string, state: FlowState, op
 export declare function ensureRunSystem(projectRoot: string, _options?: EnsureRunSystemOptions): Promise<FlowState>;
 export declare function listRuns(projectRoot: string): Promise<CclawRunMeta[]>;
 export declare function archiveRun(projectRoot: string, featureName?: string, options?: ArchiveRunOptions): Promise<ArchiveRunResult>;
-export declare function rewindRun(projectRoot: string, options: RewindRunOptions): Promise<RewindRunResult>;
-export declare function acknowledgeStaleStage(projectRoot: string, stage: FlowStage): Promise<{
-    acknowledged: boolean;
-    remaining: FlowStage[];
-}>;
 /**
  * Counts entries in the canonical JSONL knowledge store. An "active" entry is one
  * non-empty line that parses as JSON with the required `type` field belonging to the

package/dist/runs.js CHANGED Viewed

@@ -1,10 +1,9 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { COMMAND_FILE_ORDER, RUNTIME_ROOT } from "./constants.js";
-import { canTransition, createInitialFlowState, isFlowTrack, skippedStagesForTrack, trackStages } from "./flow-state.js";
+import { canTransition, createInitialFlowState, isFlowTrack, skippedStagesForTrack } from "./flow-state.js";
 import { ensureFeatureSystem, readActiveFeature, syncActiveFeatureSnapshot } from "./feature-system.js";
 import { ensureDir, exists, withDirectoryLock, writeFileSafe } from "./fs-utils.js";
-import { stageSchema } from "./content/stage-schema.js";
 export class InvalidStageTransitionError extends Error {
     from;
     to;
@@ -36,8 +35,6 @@ const FLOW_STATE_REL_PATH = `${RUNTIME_ROOT}/state/flow-state.json`;
 const RUNS_DIR_REL_PATH = `${RUNTIME_ROOT}/runs`;
 const ACTIVE_ARTIFACTS_REL_PATH = `${RUNTIME_ROOT}/artifacts`;
 const STATE_DIR_REL_PATH = `${RUNTIME_ROOT}/state`;
-const REWIND_LOG_REL_PATH = `${RUNTIME_ROOT}/state/rewind-log.jsonl`;
-const REWIND_ARCHIVE_DIR_NAME = "_rewind-archive";
 const FLOW_STAGE_SET = new Set(COMMAND_FILE_ORDER);
 /** State filenames explicitly excluded from the archive snapshot. */
 const STATE_SNAPSHOT_EXCLUDE = new Set([
@@ -59,12 +56,6 @@ function activeArtifactsPath(projectRoot) {
 function stateDirPath(projectRoot) {
     return path.join(projectRoot, STATE_DIR_REL_PATH);
 }
-function rewindLogPath(projectRoot) {
-    return path.join(projectRoot, REWIND_LOG_REL_PATH);
-}
-function rewindArchivePath(projectRoot, rewindId) {
-    return path.join(activeArtifactsPath(projectRoot), REWIND_ARCHIVE_DIR_NAME, rewindId);
-}
 async function snapshotStateDirectory(projectRoot, destinationRoot) {
     const sourceDir = stateDirPath(projectRoot);
     if (!(await exists(sourceDir))) {
@@ -348,23 +339,6 @@ async function uniqueArchiveId(projectRoot, baseId) {
     }
     return candidate;
 }
-function rewindTimestampId(date = new Date()) {
-    return date
-        .toISOString()
-        .replace(/[-:]/gu, "")
-        .replace(/\.\d{3}Z$/u, "Z");
-}
-function staleArtifactFileName(fileName) {
-    const ext = path.extname(fileName);
-    if (!ext) {
-        return `${fileName}.stale`;
-    }
-    const base = fileName.slice(0, -ext.length);
-    return `${base}.stale${ext}`;
-}
-function stageIndexMapForTrack(track) {
-    return new Map(trackStages(track).map((stage, index) => [stage, index]));
-}
 function retroArtifactPath(projectRoot) {
     return path.join(activeArtifactsPath(projectRoot), "09-retro.md");
 }
@@ -620,167 +594,6 @@ export async function archiveRun(projectRoot, featureName, options = {}) {
         retro: retroSummary
     };
 }
-export async function rewindRun(projectRoot, options) {
-    await ensureRunSystem(projectRoot);
-    const state = await readFlowState(projectRoot);
-    const track = state.track ?? "standard";
-    const ordered = trackStages(track);
-    const stageToIndex = stageIndexMapForTrack(track);
-    const toIndex = stageToIndex.get(options.to);
-    const currentIndex = stageToIndex.get(state.currentStage);
-    if (toIndex === undefined) {
-        throw new Error(`Cannot rewind to "${options.to}" because it is outside track "${track}".`);
-    }
-    if (currentIndex === undefined) {
-        throw new Error(`Current stage "${state.currentStage}" is not part of track "${track}".`);
-    }
-    if (toIndex > currentIndex) {
-        throw new Error(`Cannot rewind forward from "${state.currentStage}" to "${options.to}".`);
-    }
-    const reason = options.reason?.trim() && options.reason.trim().length > 0
-        ? options.reason.trim()
-        : "manual_rewind";
-    const nowIso = new Date().toISOString();
-    const rewindId = `rewind-${rewindTimestampId()}`;
-    const invalidatedStages = ordered.filter((stage) => {
-        const idx = stageToIndex.get(stage);
-        if (idx === undefined || idx <= toIndex) {
-            return false;
-        }
-        return state.completedStages.includes(stage) || stage === state.currentStage;
-    });
-    const nextCompletedStages = state.completedStages.filter((stage) => {
-        const idx = stageToIndex.get(stage);
-        return typeof idx === "number" && idx < toIndex;
-    });
-    const freshCatalog = createInitialFlowState({ activeRunId: state.activeRunId, track }).stageGateCatalog;
-    const nextCatalog = { ...state.stageGateCatalog };
-    for (const stage of ordered) {
-        const idx = stageToIndex.get(stage);
-        if (idx === undefined)
-            continue;
-        if (idx >= toIndex) {
-            nextCatalog[stage] = {
-                ...freshCatalog[stage],
-                required: [...freshCatalog[stage].required],
-                recommended: [...freshCatalog[stage].recommended],
-                conditional: [...freshCatalog[stage].conditional],
-                triggered: [],
-                passed: [],
-                blocked: []
-            };
-        }
-    }
-    const nextGuardEvidence = { ...state.guardEvidence };
-    for (const stage of ordered) {
-        const idx = stageToIndex.get(stage);
-        if (idx === undefined || idx < toIndex)
-            continue;
-        const catalog = state.stageGateCatalog[stage];
-        const gateIds = new Set([
-            ...catalog.required,
-            ...catalog.recommended,
-            ...catalog.conditional,
-            ...catalog.triggered,
-            ...catalog.passed,
-            ...catalog.blocked
-        ]);
-        for (const gateId of gateIds) {
-            delete nextGuardEvidence[gateId];
-        }
-    }
-    const nextStale = {};
-    for (const [stage, marker] of Object.entries(state.staleStages)) {
-        if (!marker)
-            continue;
-        const idx = stageToIndex.get(stage);
-        if (idx === undefined || idx <= toIndex) {
-            continue;
-        }
-        nextStale[stage] = marker;
-    }
-    for (const stage of invalidatedStages) {
-        nextStale[stage] = {
-            rewindId,
-            reason,
-            markedAt: nowIso
-        };
-    }
-    const archivePath = rewindArchivePath(projectRoot, rewindId);
-    const staleArtifacts = [];
-    for (const stage of invalidatedStages) {
-        const artifactFile = stageSchema(stage).artifactFile;
-        const artifactPath = path.join(activeArtifactsPath(projectRoot), artifactFile);
-        if (!(await exists(artifactPath))) {
-            continue;
-        }
-        await ensureDir(archivePath);
-        await ensureDir(path.join(archivePath, path.dirname(artifactFile)));
-        await fs.copyFile(artifactPath, path.join(archivePath, artifactFile));
-        const staleName = staleArtifactFileName(artifactFile);
-        const stalePath = path.join(activeArtifactsPath(projectRoot), staleName);
-        await fs.rm(stalePath, { force: true });
-        await fs.rename(artifactPath, stalePath);
-        staleArtifacts.push(staleName);
-    }
-    const rewindRecord = {
-        id: rewindId,
-        fromStage: state.currentStage,
-        toStage: options.to,
-        reason,
-        timestamp: nowIso,
-        invalidatedStages
-    };
-    const nextState = {
-        ...state,
-        currentStage: options.to,
-        completedStages: nextCompletedStages,
-        guardEvidence: nextGuardEvidence,
-        stageGateCatalog: nextCatalog,
-        staleStages: nextStale,
-        rewinds: [...state.rewinds, rewindRecord]
-    };
-    await writeFlowState(projectRoot, nextState, { allowReset: true });
-    const rewindLogEntry = {
-        ...rewindRecord,
-        track,
-        runId: state.activeRunId,
-        staleArtifacts
-    };
-    await ensureDir(path.dirname(rewindLogPath(projectRoot)));
-    await fs.appendFile(rewindLogPath(projectRoot), `${JSON.stringify(rewindLogEntry)}\n`, "utf8");
-    return {
-        rewindId,
-        from: state.currentStage,
-        to: options.to,
-        invalidatedStages,
-        staleArtifacts,
-        archivePath,
-        nextState
-    };
-}
-export async function acknowledgeStaleStage(projectRoot, stage) {
-    await ensureRunSystem(projectRoot);
-    const state = await readFlowState(projectRoot);
-    const marker = state.staleStages[stage];
-    if (!marker) {
-        return {
-            acknowledged: false,
-            remaining: Object.keys(state.staleStages).filter((value) => isFlowStage(value))
-        };
-    }
-    const nextStale = { ...state.staleStages };
-    delete nextStale[stage];
-    const nextState = {
-        ...state,
-        staleStages: nextStale
-    };
-    await writeFlowState(projectRoot, nextState, { allowReset: true });
-    return {
-        acknowledged: true,
-        remaining: Object.keys(nextStale).filter((value) => isFlowStage(value))
-    };
-}
 const KNOWLEDGE_SOFT_THRESHOLD = 50;
 async function readKnowledgeStats(projectRoot) {
     const knowledgePath = path.join(projectRoot, RUNTIME_ROOT, "knowledge.jsonl");

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "cclaw-cli",
-  "version": "0.21.1",
+  "version": "0.22.0",
   "description": "Installer-first flow toolkit for coding agents",
   "type": "module",
   "bin": {