cclaw-cli 0.27.0 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,114 @@
1
+ /**
2
+ * Run bookkeeping for backgrounded `cclaw eval` invocations.
3
+ *
4
+ * A backgrounded run writes three artifacts under `.cclaw/evals/runs/<id>/`:
5
+ *
6
+ * - `run.json` — status metadata (pid, started/ended ISO timestamps,
7
+ * exit code, argv, cwd). Updated at start and at exit.
8
+ * - `run.log` — combined stdout+stderr of the child process. This is
9
+ * what `cclaw eval runs tail` streams.
10
+ * - `run.pid` — just the pid, written atomically so `runs status`
11
+ * can probe liveness without parsing JSON.
12
+ *
13
+ * The `id` is a compact UTC timestamp followed by `-` and 6 random hex chars,
14
+ * chosen so sorting directory entries by name produces a chronological
15
+ * listing without any extra work.
16
+ */
17
+ import { randomBytes } from "node:crypto";
18
+ import fs from "node:fs/promises";
19
+ import path from "node:path";
20
+ import { EVALS_ROOT } from "../constants.js";
21
+ import { exists } from "../fs-utils.js";
22
+ export const RUNS_DIR = "runs";
23
/**
 * Build the path of the directory that holds every run folder for a project.
 *
 * @param projectRoot Path of the project root.
 * @returns `projectRoot` joined with `EVALS_ROOT` and `RUNS_DIR`.
 */
export function runsRoot(projectRoot) {
    const segments = [projectRoot, EVALS_ROOT, RUNS_DIR];
    return path.join(...segments);
}
26
/**
 * Build the path of the folder that belongs to a single run.
 *
 * @param projectRoot Path of the project root.
 * @param id Run id; used verbatim as the folder name.
 * @returns The runs root joined with `id`.
 */
export function runDir(projectRoot, id) {
    const parent = runsRoot(projectRoot);
    return path.join(parent, id);
}
29
/**
 * Build the path of a run's `run.log` file.
 *
 * @param projectRoot Path of the project root.
 * @param id Run id.
 * @returns Path of `run.log` inside the run's folder.
 */
export function runLogPath(projectRoot, id) {
    const folder = runDir(projectRoot, id);
    return path.join(folder, "run.log");
}
32
/**
 * Build the path of a run's `run.json` status file.
 *
 * @param projectRoot Path of the project root.
 * @param id Run id.
 * @returns Path of `run.json` inside the run's folder.
 */
export function runStatusPath(projectRoot, id) {
    const folder = runDir(projectRoot, id);
    return path.join(folder, "run.json");
}
35
/**
 * Create a new run id of the form `<timestamp>-<suffix>`.
 *
 * The timestamp is `now.toISOString()` with the `-` and `:` separators and
 * the fractional seconds stripped (e.g. `20240102T030405Z`). The suffix is
 * 3 random bytes rendered as 6 lowercase hex characters. Because the
 * timestamp comes first, sorting ids by name orders them by creation
 * second.
 *
 * @param now Instant to stamp the id with; defaults to the current time.
 * @returns The generated id, e.g. `20240102T030405Z-a1b2c3`.
 */
export function generateRunId(now = new Date()) {
    // "2024-01-02T03:04:05.678Z" -> "20240102T030405Z"
    const stamp = now
        .toISOString()
        .replace(/[-:]/g, "")
        .replace(/\.\d+Z$/, "Z");
    const entropy = randomBytes(3).toString("hex");
    return [stamp, entropy].join("-");
}
45
/**
 * Create the folder for a run, including any missing parent folders.
 *
 * @param projectRoot Path of the project root.
 * @param id Run id.
 * @returns The path of the run folder.
 */
export async function ensureRunDir(projectRoot, id) {
    const target = runDir(projectRoot, id);
    // `recursive: true` creates missing parents and tolerates an existing folder.
    await fs.mkdir(target, { recursive: true });
    return target;
}
50
/**
 * Persist a run's status record as pretty-printed JSON in its `run.json`.
 *
 * The run folder is created first if it does not exist yet. The file is
 * written as UTF-8 with 2-space indentation and a trailing newline,
 * replacing any previous content.
 *
 * @param projectRoot Path of the project root.
 * @param status Status record; `status.id` selects the run folder.
 */
export async function writeRunStatus(projectRoot, status) {
    const { id } = status;
    await ensureRunDir(projectRoot, id);
    const target = runStatusPath(projectRoot, id);
    const payload = JSON.stringify(status, null, 2) + "\n";
    await fs.writeFile(target, payload, "utf8");
}
54
/**
 * Load and parse a run's `run.json`.
 *
 * Best-effort by design: a missing file, an unreadable file, malformed
 * JSON, or JSON that is not a plain object all yield `null` instead of
 * throwing, so callers can treat "no usable status" uniformly.
 *
 * @param projectRoot Path of the project root.
 * @param id Run id.
 * @returns The parsed status object, or `null` when none is available.
 */
export async function readRunStatus(projectRoot, id) {
    const file = runStatusPath(projectRoot, id);
    try {
        // No separate existence probe: a missing file makes readFile reject
        // and lands in the catch below, which avoids a check-then-read race
        // and an extra stat call.
        const parsed = JSON.parse(await fs.readFile(file, "utf8"));
        // `null`, numbers, strings and arrays are valid JSON but not a
        // status record; callers read fields such as `id` and `startedAt`.
        const isRecord = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
        return isRecord ? parsed : null;
    }
    catch {
        return null;
    }
}
66
/**
 * List the status records of all runs under the runs root, most recent
 * first.
 *
 * Only directory entries are considered, and entries for which
 * `readRunStatus` yields `null` (no readable `run.json`, e.g.
 * half-initialized or manually created folders) are skipped.
 *
 * Ordering is by `startedAt` descending, with `id` descending as a
 * tie-breaker so that runs sharing a `startedAt` value still come out in
 * a stable, deterministic order.
 *
 * @param projectRoot Path of the project root.
 * @returns Array of status records (not bare ids); empty when the runs
 *   root does not exist.
 */
export async function listRuns(projectRoot) {
    const root = runsRoot(projectRoot);
    if (!(await exists(root)))
        return [];
    const entries = await fs.readdir(root, { withFileTypes: true });
    const out = [];
    for (const entry of entries) {
        if (!entry.isDirectory())
            continue;
        const status = await readRunStatus(projectRoot, entry.name);
        if (status)
            out.push(status);
    }
    // Three-way comparison that returns 0 for equal keys; a comparator that
    // never returns 0 is inconsistent and leaves the order of ties
    // implementation-defined.
    const descending = (x, y) => (x === y ? 0 : x < y ? 1 : -1);
    out.sort((a, b) => descending(a.startedAt ?? "", b.startedAt ?? "") ||
        descending(a.id ?? "", b.id ?? ""));
    return out;
}
87
/**
 * Turn a user-supplied run hint into a concrete run id.
 *
 * - A falsy hint (undefined, empty string) or the literal `"latest"`
 *   resolves to the id of the first entry returned by `listRuns`.
 * - Any other hint is returned unchanged, provided a status record can be
 *   read for it.
 *
 * @param projectRoot Path of the project root.
 * @param hint Explicit run id, `"latest"`, or nothing.
 * @returns The resolved run id, or `null` when no matching run exists.
 */
export async function resolveRunId(projectRoot, hint) {
    const wantsLatest = !hint || hint === "latest";
    if (wantsLatest) {
        const [newest] = await listRuns(projectRoot);
        return newest?.id ?? null;
    }
    const found = await readRunStatus(projectRoot, hint);
    return found ? hint : null;
}
99
/**
 * Cheap liveness probe for a run status record.
 *
 * The `state: "running"` field alone is not trusted, because a `run.json`
 * can be stale if the process died before updating it. The recorded pid is
 * therefore probed with signal 0, which performs the existence and
 * permission checks without delivering a signal.
 *
 * @param status Status record; reads `status.state` and `status.pid`.
 * @returns `true` when the record says "running" and a process with the
 *   recorded pid exists; `false` otherwise.
 */
export function isRunAlive(status) {
    if (status.state !== "running")
        return false;
    // Coerce so a numeric string keeps working, then reject anything that
    // is not a positive integer: pid 0 and negative pids address process
    // groups rather than a single process, so a missing or corrupt pid
    // could otherwise be reported as alive.
    const pid = Number(status.pid);
    if (!Number.isInteger(pid) || pid <= 0)
        return false;
    try {
        process.kill(pid, 0);
        return true;
    }
    catch (err) {
        // EPERM means the process exists but we may not signal it, so it is
        // still alive. Every other failure (ESRCH, bad argument) means dead.
        return err?.code === "EPERM";
    }
}
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Per-case sandbox for the Tier B with-tools agent.
2
+ * Per-case sandbox for the with-tools agent (agent/workflow mode).
3
3
  *
4
4
  * Every case gets its own `os.tmpdir()/cclaw-eval-<uuid>/` directory. Any
5
5
  * `contextFiles` the case declares are copied in relative to the project
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Registry of sandbox-confined tools used by the Tier B with-tools agent.
2
+ * Registry of sandbox-confined tools used by the with-tools agent (agent/workflow mode).
3
3
  *
4
4
  * The registry order defines the advertised schema order in the
5
5
  * function-calling payload. Keeping it stable means judges reading
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Shared types for Tier B sandbox-confined tools.
2
+ * Shared types for sandbox-confined tools (agent/workflow mode).
3
3
  *
4
4
  * Tools are plain async functions: they take validated arguments and a
5
5
  * sandbox handle and return a structured result. The runner serializes
@@ -11,20 +11,34 @@
11
11
  */
12
12
  import type { FlowStage } from "../types.js";
13
13
  /**
14
- * Fidelity tier for the agent-under-test.
14
+ * Evaluation mode — what the agent-under-test actually does.
15
15
  *
16
- * - `A` — single-shot API call, no tools. Cheap, validates core prompt behavior.
17
- * - `B` SDK loop with function-calling for Read/Write/Glob/Grep inside a sandbox.
18
- * - `C` — multi-stage workflow run (brainstorm -> scope -> ... -> plan) with threaded
19
- * artifacts. Most realistic tier we ship in Phase 7; literal IDE-harness runs
20
- * (claude-code / cursor-agent proxied to OpenAI-compat) are deferred to Phase 8.
16
+ * - `fixture` — verify an existing artifact against structural/rule/judge
17
+ * expectations. No LLM drafting, only verifiers (judge may still invoke
18
+ * the API). Cheapest mode.
19
+ * - `agent` — LLM drafts a single-stage artifact inside a sandbox using the
20
+ * function-calling loop (read_file/write_file/glob/grep). Replaces the
21
+ * previous single-shot path entirely.
22
+ * - `workflow` — LLM orchestrates the full multi-stage flow
23
+ * (brainstorm → scope → design → spec → plan) with threaded artifacts.
24
+ *
25
+ * Legacy `A|B|C` tier names are still accepted by the CLI/config loader with
26
+ * a deprecation warning — see `src/eval/mode.ts` for the mapping.
27
+ */
28
+ export declare const EVAL_MODES: readonly ["fixture", "agent", "workflow"];
29
+ export type EvalMode = (typeof EVAL_MODES)[number];
30
+ /**
31
+ * Legacy tier identifier, kept so on-disk reports generated before v0.28.0
32
+ * keep parsing. New code should always use `EvalMode`.
33
+ * @deprecated use `EvalMode` + `toMode()` from `src/eval/mode.ts`.
21
34
  */
22
35
  export declare const EVAL_TIERS: readonly ["A", "B", "C"];
36
+ /** @deprecated use `EvalMode`. */
23
37
  export type EvalTier = (typeof EVAL_TIERS)[number];
24
38
  /**
25
39
  * Verifier kinds, in increasing cost and decreasing determinism:
26
40
  * structural and rules run without LLM; judge and workflow use the configured model.
27
- * `consistency` is the Tier C cross-artifact family (deterministic but
41
+ * `consistency` is the workflow-mode cross-artifact family (deterministic but
28
42
  * operates over multiple artifacts at once).
29
43
  */
30
44
  export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow", "consistency"];
@@ -120,7 +134,7 @@ export interface TraceabilityExpected {
120
134
  * LLM-judge expectations — Step 3.
121
135
  *
122
136
  * When present, the judge runs against the resolved artifact (live-agent
123
- * output in Tier A/B/C, or the pre-generated fixture when `--judge` is
137
+ * output in agent/workflow mode, or the pre-generated fixture when `--judge` is
124
138
  * combined with `--schema-only` for smoke tests). Every field below is
125
139
  * optional; the case-level hint overlays the stage-level rubric loaded
126
140
  * from `.cclaw/evals/rubrics/<stage>.yaml`.
@@ -161,7 +175,7 @@ export interface EvalCase {
161
175
  id: string;
162
176
  stage: FlowStage;
163
177
  inputPrompt: string;
164
- /** Project files copied into the Tier B/C sandbox before the agent runs. */
178
+ /** Project files copied into the agent/workflow sandbox before the agent runs. */
165
179
  contextFiles?: string[];
166
180
  /**
167
181
  * Typed expectation hints consumed by the structural/rules/judge verifiers.
@@ -196,14 +210,15 @@ export interface VerifierResult {
196
210
  export interface EvalCaseResult {
197
211
  caseId: string;
198
212
  stage: FlowStage;
199
- tier: EvalTier;
213
+ mode: EvalMode;
200
214
  passed: boolean;
201
215
  durationMs: number;
202
216
  costUsd?: number;
203
217
  verifierResults: VerifierResult[];
204
218
  /**
205
- * Tier C only: the per-stage breakdown collected by the workflow
206
- * agent. Unset for Tier A/B cases so the on-disk JSON stays small.
219
+ * Only populated in `workflow` mode: per-stage breakdown collected by
220
+ * the workflow orchestrator. Unset for `fixture` / `agent` modes so the
221
+ * on-disk JSON stays small.
207
222
  */
208
223
  workflow?: WorkflowRunSummary;
209
224
  }
@@ -215,7 +230,7 @@ export interface EvalReport {
215
230
  cclawVersion: string;
216
231
  provider: string;
217
232
  model: string;
218
- tier: EvalTier;
233
+ mode: EvalMode;
219
234
  stages: FlowStage[];
220
235
  cases: EvalCaseResult[];
221
236
  summary: {
@@ -245,8 +260,8 @@ export interface EvalConfig {
245
260
  model: string;
246
261
  /** Optional separate model for the judge role. Defaults to `model`. */
247
262
  judgeModel?: string;
248
- /** Default tier when `--tier` is not supplied. */
249
- defaultTier: EvalTier;
263
+ /** Default mode when `--mode` is not supplied. */
264
+ defaultMode: EvalMode;
250
265
  /** Optional hard stop on estimated USD spend per day. Unset = no cap. */
251
266
  dailyUsdCap?: number;
252
267
  /** Regression thresholds for CI gates. */
@@ -277,7 +292,7 @@ export interface EvalConfig {
277
292
  tokenPricing?: Record<string, TokenPricing>;
278
293
  /**
279
294
  * Maximum assistant turns (tool_calls → tool result cycles) allowed by
280
- * the Tier B with-tools agent. Defaults to 8 when unset. Runs that
295
+ * the with-tools agent loop (agent/workflow mode). Defaults to 8. Runs that
281
296
  * exceed the cap fail with a `MaxTurnsExceededError` and surface as a
282
297
  * workflow verifier result.
283
298
  */
@@ -294,7 +309,7 @@ export interface EvalConfig {
294
309
  */
295
310
  toolMaxResultBytes?: number;
296
311
  /**
297
- * Maximum total turns a single Tier C workflow case may consume
312
+ * Maximum total turns a single workflow-mode case may consume
298
313
  * across all stages combined. Defaults to 40 (stages × toolMaxTurns).
299
314
  * Runs that exceed the cap fail the current stage with a
300
315
  * `MaxTurnsExceededError` propagated from the underlying with-tools
@@ -325,6 +340,18 @@ export interface BaselineSnapshot {
325
340
  cclawVersion: string;
326
341
  /** Keyed by `EvalCase.id` so unchanged cases produce zero diff. */
327
342
  cases: Record<string, BaselineCaseEntry>;
343
+ /**
344
+ * Tamper-evident signature computed as sha256 over the canonical JSON
345
+ * encoding of `{ schemaVersion, stage, cases }`. Present on files
346
+ * written by cclaw >= 0.28.0; older baselines load with `signature`
347
+ * absent and the loader skips verification.
348
+ */
349
+ signature?: {
350
+ algorithm: "sha256";
351
+ digest: string;
352
+ /** ISO timestamp of when the digest was computed. */
353
+ signedAt: string;
354
+ };
328
355
  }
329
356
  export interface BaselineCaseEntry {
330
357
  passed: boolean;
@@ -415,7 +442,7 @@ export interface JudgeInvocation {
415
442
  durationMs: number;
416
443
  }
417
444
  /**
418
- * Tool-use summary produced by the Tier B with-tools agent. Captured so
445
+ * Tool-use summary produced by the with-tools agent loop. Captured so
419
446
  * the runner can surface per-case tool metrics in the markdown report
420
447
  * (number of calls, depth, error rate, denied paths).
421
448
  */
@@ -432,7 +459,7 @@ export interface ToolUseSummary {
432
459
  byTool: Record<string, number>;
433
460
  }
434
461
  /**
435
- * Cross-stage consistency expectations for a Tier C workflow case. Every
462
+ * Cross-stage consistency expectations for a workflow-mode case. Every
436
463
  * sub-check is optional so authors can opt in incrementally; an empty
437
464
  * block produces zero verifier results.
438
465
  */
@@ -470,8 +497,8 @@ export interface WorkflowConsistencyExpected {
470
497
  }>;
471
498
  }
472
499
  /**
473
- * A single stage step inside a Tier C workflow case. The stage's
474
- * `inputPrompt` is handed to the Tier B with-tools agent with prior-stage
500
+ * A single stage step inside a workflow-mode case. The stage's
501
+ * `inputPrompt` is handed to the with-tools agent loop with prior-stage
475
502
  * artifacts seeded into the sandbox under `stages/<name>.md`.
476
503
  */
477
504
  export interface WorkflowStageStep {
@@ -485,14 +512,14 @@ export interface WorkflowStageStep {
485
512
  minimumScores?: Record<string, number>;
486
513
  }
487
514
  /**
488
- * Supported workflow stages. Deliberately a subset of `FlowStage` —
489
- * Tier C covers the early "design" arc of a project. TDD/review/ship
515
+ * Supported workflow-mode stages. Deliberately a subset of `FlowStage` —
516
+ * the workflow mode covers the early "design" arc of a project. TDD/review/ship
490
517
  * are out of scope (they require real code execution).
491
518
  */
492
519
  export declare const WORKFLOW_STAGES: readonly ["brainstorm", "scope", "design", "spec", "plan"];
493
520
  export type WorkflowStageName = (typeof WORKFLOW_STAGES)[number];
494
521
  /**
495
- * A Tier C workflow case. Lives under
522
+ * A workflow-mode case. Lives under
496
523
  * `.cclaw/evals/corpus/workflows/<id>.yaml` and wires a multi-stage run
497
524
  * through the with-tools agent.
498
525
  */
@@ -504,10 +531,10 @@ export interface WorkflowCase {
504
531
  contextFiles?: string[];
505
532
  /** Ordered list of stages to run. Must be non-empty. */
506
533
  stages: WorkflowStageStep[];
507
- /** Cross-stage consistency checks (Tier C-specific verifier family). */
534
+ /** Cross-stage consistency checks (workflow-mode verifier family). */
508
535
  consistency?: WorkflowConsistencyExpected;
509
536
  }
510
- /** Per-stage record inside a Tier C workflow run. */
537
+ /** Per-stage record inside a workflow-mode run. */
511
538
  export interface WorkflowStageResult {
512
539
  stage: WorkflowStageName;
513
540
  artifact: string;
@@ -523,7 +550,7 @@ export interface WorkflowStageResult {
523
550
  /** Per-rubric-check medians keyed by check id (for the report). */
524
551
  judgeMedians?: Record<string, number>;
525
552
  }
526
- /** Tier C orchestration output collected by the runner. */
553
+ /** Workflow-mode orchestration output collected by the runner. */
527
554
  export interface WorkflowRunSummary {
528
555
  caseId: string;
529
556
  stages: WorkflowStageResult[];
@@ -1,17 +1,29 @@
1
1
  /**
2
- * Fidelity tier for the agent-under-test.
2
+ * Evaluation mode — what the agent-under-test actually does.
3
3
  *
4
- * - `A` — single-shot API call, no tools. Cheap, validates core prompt behavior.
5
- * - `B` SDK loop with function-calling for Read/Write/Glob/Grep inside a sandbox.
6
- * - `C` — multi-stage workflow run (brainstorm -> scope -> ... -> plan) with threaded
7
- * artifacts. Most realistic tier we ship in Phase 7; literal IDE-harness runs
8
- * (claude-code / cursor-agent proxied to OpenAI-compat) are deferred to Phase 8.
4
+ * - `fixture` — verify an existing artifact against structural/rule/judge
5
+ * expectations. No LLM drafting, only verifiers (judge may still invoke
6
+ * the API). Cheapest mode.
7
+ * - `agent` — LLM drafts a single-stage artifact inside a sandbox using the
8
+ * function-calling loop (read_file/write_file/glob/grep). Replaces the
9
+ * previous single-shot path entirely.
10
+ * - `workflow` — LLM orchestrates the full multi-stage flow
11
+ * (brainstorm → scope → design → spec → plan) with threaded artifacts.
12
+ *
13
+ * Legacy `A|B|C` tier names are still accepted by the CLI/config loader with
14
+ * a deprecation warning — see `src/eval/mode.ts` for the mapping.
15
+ */
16
+ export const EVAL_MODES = ["fixture", "agent", "workflow"];
17
+ /**
18
+ * Legacy tier identifier, kept so on-disk reports generated before v0.28.0
19
+ * keep parsing. New code should always use `EvalMode`.
20
+ * @deprecated use `EvalMode` + `toMode()` from `src/eval/mode.ts`.
9
21
  */
10
22
  export const EVAL_TIERS = ["A", "B", "C"];
11
23
  /**
12
24
  * Verifier kinds, in increasing cost and decreasing determinism:
13
25
  * structural and rules run without LLM; judge and workflow use the configured model.
14
- * `consistency` is the Tier C cross-artifact family (deterministic but
26
+ * `consistency` is the workflow-mode cross-artifact family (deterministic but
15
27
  * operates over multiple artifacts at once).
16
28
  */
17
29
  export const VERIFIER_KINDS = [
@@ -22,8 +34,8 @@ export const VERIFIER_KINDS = [
22
34
  "consistency"
23
35
  ];
24
36
  /**
25
- * Supported workflow stages. Deliberately a subset of `FlowStage` —
26
- * Tier C covers the early "design" arc of a project. TDD/review/ship
37
+ * Supported workflow-mode stages. Deliberately a subset of `FlowStage` —
38
+ * the workflow mode covers the early "design" arc of a project. TDD/review/ship
27
39
  * are out of scope (they require real code execution).
28
40
  */
29
41
  export const WORKFLOW_STAGES = [
@@ -1,5 +1,5 @@
1
1
  /**
2
- * Cross-artifact consistency verifier for Tier C.
2
+ * Cross-artifact consistency verifier for workflow mode.
3
3
  *
4
4
  * Operates over a `{ stage → artifact }` map produced by the workflow
5
5
  * agent and emits deterministic verifier results for:
@@ -1,7 +1,7 @@
1
1
  import type { WorkflowCase } from "./types.js";
2
2
  /**
3
- * Load every Tier C workflow case under
3
+ * Load every workflow-mode case under
4
4
  * `.cclaw/evals/corpus/workflows/*.yaml`. Returns an empty array when the
5
- * directory is missing — a fresh `cclaw init` has no Tier C corpus yet.
5
+ * directory is missing — a fresh `cclaw init` has no workflow corpus yet.
6
6
  */
7
7
  export declare function loadWorkflowCorpus(projectRoot: string): Promise<WorkflowCase[]>;
@@ -1,7 +1,7 @@
1
1
  /**
2
- * Workflow corpus loader for Tier C.
2
+ * Workflow corpus loader (workflow mode).
3
3
  *
4
- * Tier C cases live under `.cclaw/evals/corpus/workflows/<id>.yaml` and
4
+ * Workflow-mode cases live under `.cclaw/evals/corpus/workflows/<id>.yaml` and
5
5
  * describe a multi-stage run that chains the with-tools agent across
6
6
  * `brainstorm → scope → design → spec → plan`. Unlike single-stage
7
7
  * cases (which are keyed by stage folder), workflow cases ship as a
@@ -177,9 +177,9 @@ function validateWorkflowCase(filePath, raw) {
177
177
  return out;
178
178
  }
179
179
  /**
180
- * Load every Tier C workflow case under
180
+ * Load every workflow-mode case under
181
181
  * `.cclaw/evals/corpus/workflows/*.yaml`. Returns an empty array when the
182
- * directory is missing — a fresh `cclaw init` has no Tier C corpus yet.
182
+ * directory is missing — a fresh `cclaw init` has no workflow corpus yet.
183
183
  */
184
184
  export async function loadWorkflowCorpus(projectRoot) {
185
185
  const dir = path.join(projectRoot, EVALS_ROOT, "corpus", "workflows");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "cclaw-cli",
3
- "version": "0.27.0",
3
+ "version": "0.28.0",
4
4
  "description": "Installer-first flow toolkit for coding agents",
5
5
  "type": "module",
6
6
  "bin": {