npm - cclaw-cli - Versions diffs - 0.27.0 → 0.29.0 - Mend

cclaw-cli 0.27.0 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44)

package/README.md +421 -64
package/dist/cli.d.ts +8 -4
package/dist/cli.js +318 -47
package/dist/constants.d.ts +1 -1
package/dist/constants.js +34 -1
package/dist/content/eval-scaffold.d.ts +2 -2
package/dist/content/eval-scaffold.js +7 -6
package/dist/content/start-command.d.ts +3 -2
package/dist/content/start-command.js +5 -4
package/dist/eval/agents/single-shot.d.ts +1 -1
package/dist/eval/agents/single-shot.js +4 -4
package/dist/eval/agents/with-tools.d.ts +6 -6
package/dist/eval/agents/with-tools.js +5 -5
package/dist/eval/agents/workflow.d.ts +7 -0
package/dist/eval/agents/workflow.js +5 -3
package/dist/eval/baseline.d.ts +24 -0
package/dist/eval/baseline.js +75 -2
package/dist/eval/config-loader.js +46 -17
package/dist/eval/cost-guard.d.ts +22 -0
package/dist/eval/cost-guard.js +38 -1
package/dist/eval/diff.d.ts +1 -1
package/dist/eval/diff.js +3 -3
package/dist/eval/llm-client.d.ts +13 -2
package/dist/eval/llm-client.js +8 -1
package/dist/eval/mode.d.ts +28 -0
package/dist/eval/mode.js +61 -0
package/dist/eval/progress.d.ts +83 -0
package/dist/eval/progress.js +59 -0
package/dist/eval/report.js +1 -1
package/dist/eval/runner.d.ts +29 -9
package/dist/eval/runner.js +148 -56
package/dist/eval/runs.d.ts +41 -0
package/dist/eval/runs.js +114 -0
package/dist/eval/sandbox.js +1 -1
package/dist/eval/tools/index.js +1 -1
package/dist/eval/tools/types.d.ts +1 -1
package/dist/eval/types.d.ts +54 -27
package/dist/eval/types.js +21 -9
package/dist/eval/verifiers/workflow-consistency.d.ts +1 -1
package/dist/eval/workflow-corpus.d.ts +2 -2
package/dist/eval/workflow-corpus.js +4 -4
package/dist/install.d.ts +10 -0
package/dist/install.js +19 -5
package/package.json +1 -1

package/dist/eval/types.d.ts CHANGED Viewed

@@ -11,20 +11,34 @@
  */
 import type { FlowStage } from "../types.js";
 /**
- * Fidelity tier for the agent-under-test.
+ * Evaluation mode — what the agent-under-test actually does.
  *
- * - `A` — single-shot API call, no tools. Cheap, validates core prompt behavior.
- * - `B` — SDK loop with function-calling for Read/Write/Glob/Grep inside a sandbox.
- * - `C` — multi-stage workflow run (brainstorm -> scope -> ... -> plan) with threaded
- *   artifacts. Most realistic tier we ship in Phase 7; literal IDE-harness runs
- *   (claude-code / cursor-agent proxied to OpenAI-compat) are deferred to Phase 8.
+ * - `fixture` — verify an existing artifact against structural/rule/judge
+ *   expectations. No LLM drafting, only verifiers (judge may still invoke
+ *   the API). Cheapest mode.
+ * - `agent` — LLM drafts a single-stage artifact inside a sandbox using the
+ *   function-calling loop (read_file/write_file/glob/grep). Replaces the
+ *   previous single-shot path entirely.
+ * - `workflow` — LLM orchestrates the full multi-stage flow
+ *   (brainstorm → scope → design → spec → plan) with threaded artifacts.
+ *
+ * Legacy `A|B|C` tier names are still accepted by the CLI/config loader with
+ * a deprecation warning — see `src/eval/mode.ts` for the mapping.
+ */
+export declare const EVAL_MODES: readonly ["fixture", "agent", "workflow"];
+export type EvalMode = (typeof EVAL_MODES)[number];
+/**
+ * Legacy tier identifier, kept so on-disk reports generated before v0.28.0
+ * keep parsing. New code should always use `EvalMode`.
+ * @deprecated use `EvalMode` + `toMode()` from `src/eval/mode.ts`.
  */
 export declare const EVAL_TIERS: readonly ["A", "B", "C"];
+/** @deprecated use `EvalMode`. */
 export type EvalTier = (typeof EVAL_TIERS)[number];
 /**
  * Verifier kinds, in increasing cost and decreasing determinism:
  * structural and rules run without LLM; judge and workflow use the configured model.
- * `consistency` is the Tier C cross-artifact family (deterministic but
+ * `consistency` is the workflow-mode cross-artifact family (deterministic but
  * operates over multiple artifacts at once).
  */
 export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow", "consistency"];
@@ -120,7 +134,7 @@ export interface TraceabilityExpected {
  * LLM-judge expectations — Step 3.
  *
  * When present, the judge runs against the resolved artifact (live-agent
- * output in Tier A/B/C, or the pre-generated fixture when `--judge` is
+ * output in agent/workflow mode, or the pre-generated fixture when `--judge` is
  * combined with `--schema-only` for smoke tests). Every field below is
  * optional; the case-level hint overlays the stage-level rubric loaded
  * from `.cclaw/evals/rubrics/<stage>.yaml`.
@@ -161,7 +175,7 @@ export interface EvalCase {
     id: string;
     stage: FlowStage;
     inputPrompt: string;
-    /** Project files copied into the Tier B/C sandbox before the agent runs. */
+    /** Project files copied into the agent/workflow sandbox before the agent runs. */
     contextFiles?: string[];
     /**
      * Typed expectation hints consumed by the structural/rules/judge verifiers.
@@ -196,14 +210,15 @@ export interface VerifierResult {
 export interface EvalCaseResult {
     caseId: string;
     stage: FlowStage;
-    tier: EvalTier;
+    mode: EvalMode;
     passed: boolean;
     durationMs: number;
     costUsd?: number;
     verifierResults: VerifierResult[];
     /**
-     * Tier C only: the per-stage breakdown collected by the workflow
-     * agent. Unset for Tier A/B cases so the on-disk JSON stays small.
+     * Only populated in `workflow` mode: per-stage breakdown collected by
+     * the workflow orchestrator. Unset for `fixture` / `agent` modes so the
+     * on-disk JSON stays small.
      */
     workflow?: WorkflowRunSummary;
 }
@@ -215,7 +230,7 @@ export interface EvalReport {
     cclawVersion: string;
     provider: string;
     model: string;
-    tier: EvalTier;
+    mode: EvalMode;
     stages: FlowStage[];
     cases: EvalCaseResult[];
     summary: {
@@ -245,8 +260,8 @@ export interface EvalConfig {
     model: string;
     /** Optional separate model for the judge role. Defaults to `model`. */
     judgeModel?: string;
-    /** Default tier when `--tier` is not supplied. */
-    defaultTier: EvalTier;
+    /** Default mode when `--mode` is not supplied. */
+    defaultMode: EvalMode;
     /** Optional hard stop on estimated USD spend per day. Unset = no cap. */
     dailyUsdCap?: number;
     /** Regression thresholds for CI gates. */
@@ -277,7 +292,7 @@ export interface EvalConfig {
     tokenPricing?: Record<string, TokenPricing>;
     /**
      * Maximum assistant turns (tool_calls → tool result cycles) allowed by
-     * the Tier B with-tools agent. Defaults to 8 when unset. Runs that
+     * the with-tools agent loop (agent/workflow mode). Defaults to 8. Runs that
      * exceed the cap fail with a `MaxTurnsExceededError` and surface as a
      * workflow verifier result.
      */
@@ -294,7 +309,7 @@ export interface EvalConfig {
      */
     toolMaxResultBytes?: number;
     /**
-     * Maximum total turns a single Tier C workflow case may consume
+     * Maximum total turns a single workflow-mode case may consume
      * across all stages combined. Defaults to 40 (stages × toolMaxTurns).
      * Runs that exceed the cap fail the current stage with a
      * `MaxTurnsExceededError` propagated from the underlying with-tools
@@ -325,6 +340,18 @@ export interface BaselineSnapshot {
     cclawVersion: string;
     /** Keyed by `EvalCase.id` so unchanged cases produce zero diff. */
     cases: Record<string, BaselineCaseEntry>;
+    /**
+     * Tamper-evident signature computed as sha256 over the canonical JSON
+     * encoding of `{ schemaVersion, stage, cases }`. Present on files
+     * written by cclaw >= 0.28.0; older baselines load with `signature`
+     * absent and the loader skips verification.
+     */
+    signature?: {
+        algorithm: "sha256";
+        digest: string;
+        /** ISO timestamp of when the digest was computed. */
+        signedAt: string;
+    };
 }
 export interface BaselineCaseEntry {
     passed: boolean;
@@ -415,7 +442,7 @@ export interface JudgeInvocation {
     durationMs: number;
 }
 /**
- * Tool-use summary produced by the Tier B with-tools agent. Captured so
+ * Tool-use summary produced by the with-tools agent loop. Captured so
  * the runner can surface per-case tool metrics in the markdown report
  * (number of calls, depth, error rate, denied paths).
  */
@@ -432,7 +459,7 @@ export interface ToolUseSummary {
     byTool: Record<string, number>;
 }
 /**
- * Cross-stage consistency expectations for a Tier C workflow case. Every
+ * Cross-stage consistency expectations for a workflow-mode case. Every
  * sub-check is optional so authors can opt in incrementally; an empty
  * block produces zero verifier results.
  */
@@ -470,8 +497,8 @@ export interface WorkflowConsistencyExpected {
     }>;
 }
 /**
- * A single stage step inside a Tier C workflow case. The stage's
- * `inputPrompt` is handed to the Tier B with-tools agent with prior-stage
+ * A single stage step inside a workflow-mode case. The stage's
+ * `inputPrompt` is handed to the with-tools agent loop with prior-stage
  * artifacts seeded into the sandbox under `stages/<name>.md`.
  */
 export interface WorkflowStageStep {
@@ -485,14 +512,14 @@ export interface WorkflowStageStep {
     minimumScores?: Record<string, number>;
 }
 /**
- * Supported workflow stages. Deliberately a subset of `FlowStage` —
- * Tier C covers the early "design" arc of a project. TDD/review/ship
+ * Supported workflow-mode stages. Deliberately a subset of `FlowStage` —
+ * the workflow mode covers the early "design" arc of a project. TDD/review/ship
  * are out of scope (they require real code execution).
  */
 export declare const WORKFLOW_STAGES: readonly ["brainstorm", "scope", "design", "spec", "plan"];
 export type WorkflowStageName = (typeof WORKFLOW_STAGES)[number];
 /**
- * A Tier C workflow case. Lives under
+ * A workflow-mode case. Lives under
  * `.cclaw/evals/corpus/workflows/<id>.yaml` and wires a multi-stage run
  * through the with-tools agent.
  */
@@ -504,10 +531,10 @@ export interface WorkflowCase {
     contextFiles?: string[];
     /** Ordered list of stages to run. Must be non-empty. */
     stages: WorkflowStageStep[];
-    /** Cross-stage consistency checks (Tier C-specific verifier family). */
+    /** Cross-stage consistency checks (workflow-mode verifier family). */
     consistency?: WorkflowConsistencyExpected;
 }
-/** Per-stage record inside a Tier C workflow run. */
+/** Per-stage record inside a workflow-mode run. */
 export interface WorkflowStageResult {
     stage: WorkflowStageName;
     artifact: string;
@@ -523,7 +550,7 @@ export interface WorkflowStageResult {
     /** Per-rubric-check medians keyed by check id (for the report). */
     judgeMedians?: Record<string, number>;
 }
-/** Tier C orchestration output collected by the runner. */
+/** Workflow-mode orchestration output collected by the runner. */
 export interface WorkflowRunSummary {
     caseId: string;
     stages: WorkflowStageResult[];

package/dist/eval/types.js CHANGED Viewed

@@ -1,17 +1,29 @@
 /**
- * Fidelity tier for the agent-under-test.
+ * Evaluation mode — what the agent-under-test actually does.
  *
- * - `A` — single-shot API call, no tools. Cheap, validates core prompt behavior.
- * - `B` — SDK loop with function-calling for Read/Write/Glob/Grep inside a sandbox.
- * - `C` — multi-stage workflow run (brainstorm -> scope -> ... -> plan) with threaded
- *   artifacts. Most realistic tier we ship in Phase 7; literal IDE-harness runs
- *   (claude-code / cursor-agent proxied to OpenAI-compat) are deferred to Phase 8.
+ * - `fixture` — verify an existing artifact against structural/rule/judge
+ *   expectations. No LLM drafting, only verifiers (judge may still invoke
+ *   the API). Cheapest mode.
+ * - `agent` — LLM drafts a single-stage artifact inside a sandbox using the
+ *   function-calling loop (read_file/write_file/glob/grep). Replaces the
+ *   previous single-shot path entirely.
+ * - `workflow` — LLM orchestrates the full multi-stage flow
+ *   (brainstorm → scope → design → spec → plan) with threaded artifacts.
+ *
+ * Legacy `A|B|C` tier names are still accepted by the CLI/config loader with
+ * a deprecation warning — see `src/eval/mode.ts` for the mapping.
+ */
+export const EVAL_MODES = ["fixture", "agent", "workflow"];
+/**
+ * Legacy tier identifier, kept so on-disk reports generated before v0.28.0
+ * keep parsing. New code should always use `EvalMode`.
+ * @deprecated use `EvalMode` + `toMode()` from `src/eval/mode.ts`.
  */
 export const EVAL_TIERS = ["A", "B", "C"];
 /**
  * Verifier kinds, in increasing cost and decreasing determinism:
  * structural and rules run without LLM; judge and workflow use the configured model.
- * `consistency` is the Tier C cross-artifact family (deterministic but
+ * `consistency` is the workflow-mode cross-artifact family (deterministic but
  * operates over multiple artifacts at once).
  */
 export const VERIFIER_KINDS = [
@@ -22,8 +34,8 @@ export const VERIFIER_KINDS = [
     "consistency"
 ];
 /**
- * Supported workflow stages. Deliberately a subset of `FlowStage` —
- * Tier C covers the early "design" arc of a project. TDD/review/ship
+ * Supported workflow-mode stages. Deliberately a subset of `FlowStage` —
+ * the workflow mode covers the early "design" arc of a project. TDD/review/ship
  * are out of scope (they require real code execution).
  */
 export const WORKFLOW_STAGES = [

package/dist/eval/verifiers/workflow-consistency.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 /**
- * Cross-artifact consistency verifier for Tier C.
+ * Cross-artifact consistency verifier for workflow mode.
  *
  * Operates over a `{ stage → artifact }` map produced by the workflow
  * agent and emits deterministic verifier results for:

package/dist/eval/workflow-corpus.d.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 import type { WorkflowCase } from "./types.js";
 /**
- * Load every Tier C workflow case under
+ * Load every workflow-mode case under
  * `.cclaw/evals/corpus/workflows/*.yaml`. Returns an empty array when the
- * directory is missing — a fresh `cclaw init` has no Tier C corpus yet.
+ * directory is missing — a fresh `cclaw init` has no workflow corpus yet.
  */
 export declare function loadWorkflowCorpus(projectRoot: string): Promise<WorkflowCase[]>;

package/dist/eval/workflow-corpus.js CHANGED Viewed

@@ -1,7 +1,7 @@
 /**
- * Workflow corpus loader for Tier C.
+ * Workflow corpus loader (workflow mode).
  *
- * Tier C cases live under `.cclaw/evals/corpus/workflows/<id>.yaml` and
+ * Workflow-mode cases live under `.cclaw/evals/corpus/workflows/<id>.yaml` and
  * describe a multi-stage run that chains the with-tools agent across
  * `brainstorm → scope → design → spec → plan`. Unlike single-stage
  * cases (which are keyed by stage folder), workflow cases ship as a
@@ -177,9 +177,9 @@ function validateWorkflowCase(filePath, raw) {
     return out;
 }
 /**
- * Load every Tier C workflow case under
+ * Load every workflow-mode case under
  * `.cclaw/evals/corpus/workflows/*.yaml`. Returns an empty array when the
- * directory is missing — a fresh `cclaw init` has no Tier C corpus yet.
+ * directory is missing — a fresh `cclaw init` has no workflow corpus yet.
  */
 export async function loadWorkflowCorpus(projectRoot) {
     const dir = path.join(projectRoot, EVALS_ROOT, "corpus", "workflows");

package/dist/install.d.ts CHANGED Viewed

@@ -8,5 +8,15 @@ export interface InitOptions {
 }
 export declare function initCclaw(options: InitOptions): Promise<void>;
 export declare function syncCclaw(projectRoot: string): Promise<void>;
+/**
+ * Refresh generated files in `.cclaw/` without touching user-authored
+ * artifacts, state, or custom config keys. Only the `version` + `flowVersion`
+ * stamps are rewritten so the on-disk config reflects the installed CLI;
+ * `promptGuardMode`, `tddEnforcement`, `gitHookGuards`, `languageRulePacks`,
+ * and `trackHeuristics` are preserved verbatim from the existing config.
+ *
+ * For an explicit reset to the default profile the user should reinstall via
+ * `cclaw init --profile=<id>` (after optionally archiving the current run).
+ */
 export declare function upgradeCclaw(projectRoot: string): Promise<void>;
 export declare function uninstallCclaw(projectRoot: string): Promise<void>;

package/dist/install.js CHANGED Viewed

@@ -2,7 +2,7 @@ import { execFile } from "node:child_process";
 import fs from "node:fs/promises";
 import path from "node:path";
 import { promisify } from "node:util";
-import { COMMAND_FILE_ORDER, REQUIRED_DIRS, RUNTIME_ROOT } from "./constants.js";
+import { CCLAW_VERSION, COMMAND_FILE_ORDER, FLOW_VERSION, REQUIRED_DIRS, RUNTIME_ROOT } from "./constants.js";
 import { writeConfig, createDefaultConfig, createProfileConfig, readConfig, configPath } from "./config.js";
 import { commandContract } from "./content/contracts.js";
 import { contextModeFiles, createInitialContextModeState } from "./content/contexts.js";
@@ -1104,11 +1104,25 @@ export async function syncCclaw(projectRoot) {
     }
     await materializeRuntime(projectRoot, config, false);
 }
+/**
+ * Refresh generated files in `.cclaw/` without touching user-authored
+ * artifacts, state, or custom config keys. Only the `version` + `flowVersion`
+ * stamps are rewritten so the on-disk config reflects the installed CLI;
+ * `promptGuardMode`, `tddEnforcement`, `gitHookGuards`, `languageRulePacks`,
+ * and `trackHeuristics` are preserved verbatim from the existing config.
+ *
+ * For an explicit reset to the default profile the user should reinstall via
+ * `cclaw init --profile=<id>` (after optionally archiving the current run).
+ */
 export async function upgradeCclaw(projectRoot) {
-    const config = await readConfig(projectRoot);
-    const upgradedConfig = createDefaultConfig(config.harnesses);
-    await writeConfig(projectRoot, upgradedConfig);
-    await materializeRuntime(projectRoot, upgradedConfig, false);
+    const existing = await readConfig(projectRoot);
+    const upgraded = {
+        ...existing,
+        version: CCLAW_VERSION,
+        flowVersion: FLOW_VERSION
+    };
+    await writeConfig(projectRoot, upgraded);
+    await materializeRuntime(projectRoot, upgraded, false);
 }
 function stripManagedHookCommands(value) {
     if (!value || typeof value !== "object" || Array.isArray(value)) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "cclaw-cli",
-  "version": "0.27.0",
+  "version": "0.29.0",
   "description": "Installer-first flow toolkit for coding agents",
   "type": "module",
   "bin": {