cclaw-cli 0.27.0 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +421 -64
- package/dist/cli.d.ts +8 -4
- package/dist/cli.js +318 -47
- package/dist/constants.d.ts +1 -1
- package/dist/constants.js +34 -1
- package/dist/content/eval-scaffold.d.ts +2 -2
- package/dist/content/eval-scaffold.js +7 -6
- package/dist/content/start-command.d.ts +3 -2
- package/dist/content/start-command.js +5 -4
- package/dist/eval/agents/single-shot.d.ts +1 -1
- package/dist/eval/agents/single-shot.js +4 -4
- package/dist/eval/agents/with-tools.d.ts +6 -6
- package/dist/eval/agents/with-tools.js +5 -5
- package/dist/eval/agents/workflow.d.ts +7 -0
- package/dist/eval/agents/workflow.js +5 -3
- package/dist/eval/baseline.d.ts +24 -0
- package/dist/eval/baseline.js +75 -2
- package/dist/eval/config-loader.js +46 -17
- package/dist/eval/cost-guard.d.ts +22 -0
- package/dist/eval/cost-guard.js +38 -1
- package/dist/eval/diff.d.ts +1 -1
- package/dist/eval/diff.js +3 -3
- package/dist/eval/llm-client.d.ts +13 -2
- package/dist/eval/llm-client.js +8 -1
- package/dist/eval/mode.d.ts +28 -0
- package/dist/eval/mode.js +61 -0
- package/dist/eval/progress.d.ts +83 -0
- package/dist/eval/progress.js +59 -0
- package/dist/eval/report.js +1 -1
- package/dist/eval/runner.d.ts +29 -9
- package/dist/eval/runner.js +148 -56
- package/dist/eval/runs.d.ts +41 -0
- package/dist/eval/runs.js +114 -0
- package/dist/eval/sandbox.js +1 -1
- package/dist/eval/tools/index.js +1 -1
- package/dist/eval/tools/types.d.ts +1 -1
- package/dist/eval/types.d.ts +54 -27
- package/dist/eval/types.js +21 -9
- package/dist/eval/verifiers/workflow-consistency.d.ts +1 -1
- package/dist/eval/workflow-corpus.d.ts +2 -2
- package/dist/eval/workflow-corpus.js +4 -4
- package/dist/install.d.ts +10 -0
- package/dist/install.js +19 -5
- package/package.json +1 -1
package/dist/eval/types.d.ts
CHANGED
|
@@ -11,20 +11,34 @@
|
|
|
11
11
|
*/
|
|
12
12
|
import type { FlowStage } from "../types.js";
|
|
13
13
|
/**
|
|
14
|
-
*
|
|
14
|
+
* Evaluation mode — what the agent-under-test actually does.
|
|
15
15
|
*
|
|
16
|
-
* - `
|
|
17
|
-
*
|
|
18
|
-
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
16
|
+
* - `fixture` — verify an existing artifact against structural/rule/judge
|
|
17
|
+
* expectations. No LLM drafting, only verifiers (judge may still invoke
|
|
18
|
+
* the API). Cheapest mode.
|
|
19
|
+
* - `agent` — LLM drafts a single-stage artifact inside a sandbox using the
|
|
20
|
+
* function-calling loop (read_file/write_file/glob/grep). Replaces the
|
|
21
|
+
* previous single-shot path entirely.
|
|
22
|
+
* - `workflow` — LLM orchestrates the full multi-stage flow
|
|
23
|
+
* (brainstorm → scope → design → spec → plan) with threaded artifacts.
|
|
24
|
+
*
|
|
25
|
+
* Legacy `A|B|C` tier names are still accepted by the CLI/config loader with
|
|
26
|
+
* a deprecation warning — see `src/eval/mode.ts` for the mapping.
|
|
27
|
+
*/
|
|
28
|
+
export declare const EVAL_MODES: readonly ["fixture", "agent", "workflow"];
|
|
29
|
+
export type EvalMode = (typeof EVAL_MODES)[number];
|
|
30
|
+
/**
|
|
31
|
+
* Legacy tier identifier, kept so on-disk reports generated before v0.28.0
|
|
32
|
+
* keep parsing. New code should always use `EvalMode`.
|
|
33
|
+
* @deprecated use `EvalMode` + `toMode()` from `src/eval/mode.ts`.
|
|
21
34
|
*/
|
|
22
35
|
export declare const EVAL_TIERS: readonly ["A", "B", "C"];
|
|
36
|
+
/** @deprecated use `EvalMode`. */
|
|
23
37
|
export type EvalTier = (typeof EVAL_TIERS)[number];
|
|
24
38
|
/**
|
|
25
39
|
* Verifier kinds, in increasing cost and decreasing determinism:
|
|
26
40
|
* structural and rules run without LLM; judge and workflow use the configured model.
|
|
27
|
-
* `consistency` is the
|
|
41
|
+
* `consistency` is the workflow-mode cross-artifact family (deterministic but
|
|
28
42
|
* operates over multiple artifacts at once).
|
|
29
43
|
*/
|
|
30
44
|
export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow", "consistency"];
|
|
@@ -120,7 +134,7 @@ export interface TraceabilityExpected {
|
|
|
120
134
|
* LLM-judge expectations — Step 3.
|
|
121
135
|
*
|
|
122
136
|
* When present, the judge runs against the resolved artifact (live-agent
|
|
123
|
-
* output in
|
|
137
|
+
* output in agent/workflow mode, or the pre-generated fixture when `--judge` is
|
|
124
138
|
* combined with `--schema-only` for smoke tests). Every field below is
|
|
125
139
|
* optional; the case-level hint overlays the stage-level rubric loaded
|
|
126
140
|
* from `.cclaw/evals/rubrics/<stage>.yaml`.
|
|
@@ -161,7 +175,7 @@ export interface EvalCase {
|
|
|
161
175
|
id: string;
|
|
162
176
|
stage: FlowStage;
|
|
163
177
|
inputPrompt: string;
|
|
164
|
-
/** Project files copied into the
|
|
178
|
+
/** Project files copied into the agent/workflow sandbox before the agent runs. */
|
|
165
179
|
contextFiles?: string[];
|
|
166
180
|
/**
|
|
167
181
|
* Typed expectation hints consumed by the structural/rules/judge verifiers.
|
|
@@ -196,14 +210,15 @@ export interface VerifierResult {
|
|
|
196
210
|
export interface EvalCaseResult {
|
|
197
211
|
caseId: string;
|
|
198
212
|
stage: FlowStage;
|
|
199
|
-
|
|
213
|
+
mode: EvalMode;
|
|
200
214
|
passed: boolean;
|
|
201
215
|
durationMs: number;
|
|
202
216
|
costUsd?: number;
|
|
203
217
|
verifierResults: VerifierResult[];
|
|
204
218
|
/**
|
|
205
|
-
*
|
|
206
|
-
*
|
|
219
|
+
* Only populated in `workflow` mode: per-stage breakdown collected by
|
|
220
|
+
* the workflow orchestrator. Unset for `fixture` / `agent` modes so the
|
|
221
|
+
* on-disk JSON stays small.
|
|
207
222
|
*/
|
|
208
223
|
workflow?: WorkflowRunSummary;
|
|
209
224
|
}
|
|
@@ -215,7 +230,7 @@ export interface EvalReport {
|
|
|
215
230
|
cclawVersion: string;
|
|
216
231
|
provider: string;
|
|
217
232
|
model: string;
|
|
218
|
-
|
|
233
|
+
mode: EvalMode;
|
|
219
234
|
stages: FlowStage[];
|
|
220
235
|
cases: EvalCaseResult[];
|
|
221
236
|
summary: {
|
|
@@ -245,8 +260,8 @@ export interface EvalConfig {
|
|
|
245
260
|
model: string;
|
|
246
261
|
/** Optional separate model for the judge role. Defaults to `model`. */
|
|
247
262
|
judgeModel?: string;
|
|
248
|
-
/** Default
|
|
249
|
-
|
|
263
|
+
/** Default mode when `--mode` is not supplied. */
|
|
264
|
+
defaultMode: EvalMode;
|
|
250
265
|
/** Optional hard stop on estimated USD spend per day. Unset = no cap. */
|
|
251
266
|
dailyUsdCap?: number;
|
|
252
267
|
/** Regression thresholds for CI gates. */
|
|
@@ -277,7 +292,7 @@ export interface EvalConfig {
|
|
|
277
292
|
tokenPricing?: Record<string, TokenPricing>;
|
|
278
293
|
/**
|
|
279
294
|
* Maximum assistant turns (tool_calls → tool result cycles) allowed by
|
|
280
|
-
* the
|
|
295
|
+
* the with-tools agent loop (agent/workflow mode). Defaults to 8. Runs that
|
|
281
296
|
* exceed the cap fail with a `MaxTurnsExceededError` and surface as a
|
|
282
297
|
* workflow verifier result.
|
|
283
298
|
*/
|
|
@@ -294,7 +309,7 @@ export interface EvalConfig {
|
|
|
294
309
|
*/
|
|
295
310
|
toolMaxResultBytes?: number;
|
|
296
311
|
/**
|
|
297
|
-
* Maximum total turns a single
|
|
312
|
+
* Maximum total turns a single workflow-mode case may consume
|
|
298
313
|
* across all stages combined. Defaults to 40 (stages × toolMaxTurns).
|
|
299
314
|
* Runs that exceed the cap fail the current stage with a
|
|
300
315
|
* `MaxTurnsExceededError` propagated from the underlying with-tools
|
|
@@ -325,6 +340,18 @@ export interface BaselineSnapshot {
|
|
|
325
340
|
cclawVersion: string;
|
|
326
341
|
/** Keyed by `EvalCase.id` so unchanged cases produce zero diff. */
|
|
327
342
|
cases: Record<string, BaselineCaseEntry>;
|
|
343
|
+
/**
|
|
344
|
+
* Tamper-evident signature computed as sha256 over the canonical JSON
|
|
345
|
+
* encoding of `{ schemaVersion, stage, cases }`. Present on files
|
|
346
|
+
* written by cclaw >= 0.28.0; older baselines load with `signature`
|
|
347
|
+
* absent and the loader skips verification.
|
|
348
|
+
*/
|
|
349
|
+
signature?: {
|
|
350
|
+
algorithm: "sha256";
|
|
351
|
+
digest: string;
|
|
352
|
+
/** ISO timestamp of when the digest was computed. */
|
|
353
|
+
signedAt: string;
|
|
354
|
+
};
|
|
328
355
|
}
|
|
329
356
|
export interface BaselineCaseEntry {
|
|
330
357
|
passed: boolean;
|
|
@@ -415,7 +442,7 @@ export interface JudgeInvocation {
|
|
|
415
442
|
durationMs: number;
|
|
416
443
|
}
|
|
417
444
|
/**
|
|
418
|
-
* Tool-use summary produced by the
|
|
445
|
+
* Tool-use summary produced by the with-tools agent loop. Captured so
|
|
419
446
|
* the runner can surface per-case tool metrics in the markdown report
|
|
420
447
|
* (number of calls, depth, error rate, denied paths).
|
|
421
448
|
*/
|
|
@@ -432,7 +459,7 @@ export interface ToolUseSummary {
|
|
|
432
459
|
byTool: Record<string, number>;
|
|
433
460
|
}
|
|
434
461
|
/**
|
|
435
|
-
* Cross-stage consistency expectations for a
|
|
462
|
+
* Cross-stage consistency expectations for a workflow-mode case. Every
|
|
436
463
|
* sub-check is optional so authors can opt in incrementally; an empty
|
|
437
464
|
* block produces zero verifier results.
|
|
438
465
|
*/
|
|
@@ -470,8 +497,8 @@ export interface WorkflowConsistencyExpected {
|
|
|
470
497
|
}>;
|
|
471
498
|
}
|
|
472
499
|
/**
|
|
473
|
-
* A single stage step inside a
|
|
474
|
-
* `inputPrompt` is handed to the
|
|
500
|
+
* A single stage step inside a workflow-mode case. The stage's
|
|
501
|
+
* `inputPrompt` is handed to the with-tools agent loop with prior-stage
|
|
475
502
|
* artifacts seeded into the sandbox under `stages/<name>.md`.
|
|
476
503
|
*/
|
|
477
504
|
export interface WorkflowStageStep {
|
|
@@ -485,14 +512,14 @@ export interface WorkflowStageStep {
|
|
|
485
512
|
minimumScores?: Record<string, number>;
|
|
486
513
|
}
|
|
487
514
|
/**
|
|
488
|
-
* Supported workflow stages. Deliberately a subset of `FlowStage` —
|
|
489
|
-
*
|
|
515
|
+
* Supported workflow-mode stages. Deliberately a subset of `FlowStage` —
|
|
516
|
+
* the workflow mode covers the early "design" arc of a project. TDD/review/ship
|
|
490
517
|
* are out of scope (they require real code execution).
|
|
491
518
|
*/
|
|
492
519
|
export declare const WORKFLOW_STAGES: readonly ["brainstorm", "scope", "design", "spec", "plan"];
|
|
493
520
|
export type WorkflowStageName = (typeof WORKFLOW_STAGES)[number];
|
|
494
521
|
/**
|
|
495
|
-
* A
|
|
522
|
+
* A workflow-mode case. Lives under
|
|
496
523
|
* `.cclaw/evals/corpus/workflows/<id>.yaml` and wires a multi-stage run
|
|
497
524
|
* through the with-tools agent.
|
|
498
525
|
*/
|
|
@@ -504,10 +531,10 @@ export interface WorkflowCase {
|
|
|
504
531
|
contextFiles?: string[];
|
|
505
532
|
/** Ordered list of stages to run. Must be non-empty. */
|
|
506
533
|
stages: WorkflowStageStep[];
|
|
507
|
-
/** Cross-stage consistency checks (
|
|
534
|
+
/** Cross-stage consistency checks (workflow-mode verifier family). */
|
|
508
535
|
consistency?: WorkflowConsistencyExpected;
|
|
509
536
|
}
|
|
510
|
-
/** Per-stage record inside a
|
|
537
|
+
/** Per-stage record inside a workflow-mode run. */
|
|
511
538
|
export interface WorkflowStageResult {
|
|
512
539
|
stage: WorkflowStageName;
|
|
513
540
|
artifact: string;
|
|
@@ -523,7 +550,7 @@ export interface WorkflowStageResult {
|
|
|
523
550
|
/** Per-rubric-check medians keyed by check id (for the report). */
|
|
524
551
|
judgeMedians?: Record<string, number>;
|
|
525
552
|
}
|
|
526
|
-
/**
|
|
553
|
+
/** Workflow-mode orchestration output collected by the runner. */
|
|
527
554
|
export interface WorkflowRunSummary {
|
|
528
555
|
caseId: string;
|
|
529
556
|
stages: WorkflowStageResult[];
|
package/dist/eval/types.js
CHANGED
|
@@ -1,17 +1,29 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
2
|
+
* Evaluation mode — what the agent-under-test actually does.
|
|
3
3
|
*
|
|
4
|
-
* - `
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
4
|
+
* - `fixture` — verify an existing artifact against structural/rule/judge
|
|
5
|
+
* expectations. No LLM drafting, only verifiers (judge may still invoke
|
|
6
|
+
* the API). Cheapest mode.
|
|
7
|
+
* - `agent` — LLM drafts a single-stage artifact inside a sandbox using the
|
|
8
|
+
* function-calling loop (read_file/write_file/glob/grep). Replaces the
|
|
9
|
+
* previous single-shot path entirely.
|
|
10
|
+
* - `workflow` — LLM orchestrates the full multi-stage flow
|
|
11
|
+
* (brainstorm → scope → design → spec → plan) with threaded artifacts.
|
|
12
|
+
*
|
|
13
|
+
* Legacy `A|B|C` tier names are still accepted by the CLI/config loader with
|
|
14
|
+
* a deprecation warning — see `src/eval/mode.ts` for the mapping.
|
|
15
|
+
*/
|
|
16
|
+
export const EVAL_MODES = ["fixture", "agent", "workflow"];
|
|
17
|
+
/**
|
|
18
|
+
* Legacy tier identifier, kept so on-disk reports generated before v0.28.0
|
|
19
|
+
* keep parsing. New code should always use `EvalMode`.
|
|
20
|
+
* @deprecated use `EvalMode` + `toMode()` from `src/eval/mode.ts`.
|
|
9
21
|
*/
|
|
10
22
|
export const EVAL_TIERS = ["A", "B", "C"];
|
|
11
23
|
/**
|
|
12
24
|
* Verifier kinds, in increasing cost and decreasing determinism:
|
|
13
25
|
* structural and rules run without LLM; judge and workflow use the configured model.
|
|
14
|
-
* `consistency` is the
|
|
26
|
+
* `consistency` is the workflow-mode cross-artifact family (deterministic but
|
|
15
27
|
* operates over multiple artifacts at once).
|
|
16
28
|
*/
|
|
17
29
|
export const VERIFIER_KINDS = [
|
|
@@ -22,8 +34,8 @@ export const VERIFIER_KINDS = [
|
|
|
22
34
|
"consistency"
|
|
23
35
|
];
|
|
24
36
|
/**
|
|
25
|
-
* Supported workflow stages. Deliberately a subset of `FlowStage` —
|
|
26
|
-
*
|
|
37
|
+
* Supported workflow-mode stages. Deliberately a subset of `FlowStage` —
|
|
38
|
+
* the workflow mode covers the early "design" arc of a project. TDD/review/ship
|
|
27
39
|
* are out of scope (they require real code execution).
|
|
28
40
|
*/
|
|
29
41
|
export const WORKFLOW_STAGES = [
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import type { WorkflowCase } from "./types.js";
|
|
2
2
|
/**
|
|
3
|
-
* Load every
|
|
3
|
+
* Load every workflow-mode case under
|
|
4
4
|
* `.cclaw/evals/corpus/workflows/*.yaml`. Returns an empty array when the
|
|
5
|
-
* directory is missing — a fresh `cclaw init` has no
|
|
5
|
+
* directory is missing — a fresh `cclaw init` has no workflow corpus yet.
|
|
6
6
|
*/
|
|
7
7
|
export declare function loadWorkflowCorpus(projectRoot: string): Promise<WorkflowCase[]>;
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Workflow corpus loader
|
|
2
|
+
* Workflow corpus loader (workflow mode).
|
|
3
3
|
*
|
|
4
|
-
*
|
|
4
|
+
* Workflow-mode cases live under `.cclaw/evals/corpus/workflows/<id>.yaml` and
|
|
5
5
|
* describe a multi-stage run that chains the with-tools agent across
|
|
6
6
|
* `brainstorm → scope → design → spec → plan`. Unlike single-stage
|
|
7
7
|
* cases (which are keyed by stage folder), workflow cases ship as a
|
|
@@ -177,9 +177,9 @@ function validateWorkflowCase(filePath, raw) {
|
|
|
177
177
|
return out;
|
|
178
178
|
}
|
|
179
179
|
/**
|
|
180
|
-
* Load every
|
|
180
|
+
* Load every workflow-mode case under
|
|
181
181
|
* `.cclaw/evals/corpus/workflows/*.yaml`. Returns an empty array when the
|
|
182
|
-
* directory is missing — a fresh `cclaw init` has no
|
|
182
|
+
* directory is missing — a fresh `cclaw init` has no workflow corpus yet.
|
|
183
183
|
*/
|
|
184
184
|
export async function loadWorkflowCorpus(projectRoot) {
|
|
185
185
|
const dir = path.join(projectRoot, EVALS_ROOT, "corpus", "workflows");
|
package/dist/install.d.ts
CHANGED
|
@@ -8,5 +8,15 @@ export interface InitOptions {
|
|
|
8
8
|
}
|
|
9
9
|
export declare function initCclaw(options: InitOptions): Promise<void>;
|
|
10
10
|
export declare function syncCclaw(projectRoot: string): Promise<void>;
|
|
11
|
+
/**
|
|
12
|
+
* Refresh generated files in `.cclaw/` without touching user-authored
|
|
13
|
+
* artifacts, state, or custom config keys. Only the `version` + `flowVersion`
|
|
14
|
+
* stamps are rewritten so the on-disk config reflects the installed CLI;
|
|
15
|
+
* `promptGuardMode`, `tddEnforcement`, `gitHookGuards`, `languageRulePacks`,
|
|
16
|
+
* and `trackHeuristics` are preserved verbatim from the existing config.
|
|
17
|
+
*
|
|
18
|
+
* For an explicit reset to the default profile the user should reinstall via
|
|
19
|
+
* `cclaw init --profile=<id>` (after optionally archiving the current run).
|
|
20
|
+
*/
|
|
11
21
|
export declare function upgradeCclaw(projectRoot: string): Promise<void>;
|
|
12
22
|
export declare function uninstallCclaw(projectRoot: string): Promise<void>;
|
package/dist/install.js
CHANGED
|
@@ -2,7 +2,7 @@ import { execFile } from "node:child_process";
|
|
|
2
2
|
import fs from "node:fs/promises";
|
|
3
3
|
import path from "node:path";
|
|
4
4
|
import { promisify } from "node:util";
|
|
5
|
-
import { COMMAND_FILE_ORDER, REQUIRED_DIRS, RUNTIME_ROOT } from "./constants.js";
|
|
5
|
+
import { CCLAW_VERSION, COMMAND_FILE_ORDER, FLOW_VERSION, REQUIRED_DIRS, RUNTIME_ROOT } from "./constants.js";
|
|
6
6
|
import { writeConfig, createDefaultConfig, createProfileConfig, readConfig, configPath } from "./config.js";
|
|
7
7
|
import { commandContract } from "./content/contracts.js";
|
|
8
8
|
import { contextModeFiles, createInitialContextModeState } from "./content/contexts.js";
|
|
@@ -1104,11 +1104,25 @@ export async function syncCclaw(projectRoot) {
|
|
|
1104
1104
|
}
|
|
1105
1105
|
await materializeRuntime(projectRoot, config, false);
|
|
1106
1106
|
}
|
|
1107
|
+
/**
|
|
1108
|
+
* Refresh generated files in `.cclaw/` without touching user-authored
|
|
1109
|
+
* artifacts, state, or custom config keys. Only the `version` + `flowVersion`
|
|
1110
|
+
* stamps are rewritten so the on-disk config reflects the installed CLI;
|
|
1111
|
+
* `promptGuardMode`, `tddEnforcement`, `gitHookGuards`, `languageRulePacks`,
|
|
1112
|
+
* and `trackHeuristics` are preserved verbatim from the existing config.
|
|
1113
|
+
*
|
|
1114
|
+
* For an explicit reset to the default profile the user should reinstall via
|
|
1115
|
+
* `cclaw init --profile=<id>` (after optionally archiving the current run).
|
|
1116
|
+
*/
|
|
1107
1117
|
export async function upgradeCclaw(projectRoot) {
|
|
1108
|
-
const
|
|
1109
|
-
const
|
|
1110
|
-
|
|
1111
|
-
|
|
1118
|
+
const existing = await readConfig(projectRoot);
|
|
1119
|
+
const upgraded = {
|
|
1120
|
+
...existing,
|
|
1121
|
+
version: CCLAW_VERSION,
|
|
1122
|
+
flowVersion: FLOW_VERSION
|
|
1123
|
+
};
|
|
1124
|
+
await writeConfig(projectRoot, upgraded);
|
|
1125
|
+
await materializeRuntime(projectRoot, upgraded, false);
|
|
1112
1126
|
}
|
|
1113
1127
|
function stripManagedHookCommands(value) {
|
|
1114
1128
|
if (!value || typeof value !== "object" || Array.isArray(value)) {
|