cclaw-cli 0.49.0 → 0.51.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -84
- package/dist/artifact-linter.d.ts +4 -0
- package/dist/artifact-linter.js +24 -3
- package/dist/cli.d.ts +1 -19
- package/dist/cli.js +49 -491
- package/dist/constants.d.ts +2 -13
- package/dist/constants.js +1 -43
- package/dist/content/closeout-guidance.d.ts +14 -0
- package/dist/content/closeout-guidance.js +42 -0
- package/dist/content/core-agents.js +55 -17
- package/dist/content/decision-protocol.d.ts +12 -0
- package/dist/content/decision-protocol.js +20 -0
- package/dist/content/diff-command.d.ts +1 -2
- package/dist/content/diff-command.js +8 -94
- package/dist/content/examples.d.ts +4 -10
- package/dist/content/examples.js +10 -20
- package/dist/content/hook-events.js +2 -2
- package/dist/content/hook-inline-snippets.d.ts +5 -2
- package/dist/content/hook-inline-snippets.js +33 -1
- package/dist/content/hook-manifest.d.ts +3 -4
- package/dist/content/hook-manifest.js +11 -12
- package/dist/content/hooks.js +44 -21
- package/dist/content/ideate-command.d.ts +2 -0
- package/dist/content/ideate-command.js +34 -25
- package/dist/content/iron-laws.d.ts +5 -5
- package/dist/content/iron-laws.js +5 -5
- package/dist/content/language-policy.d.ts +2 -0
- package/dist/content/language-policy.js +13 -0
- package/dist/content/learnings.d.ts +3 -4
- package/dist/content/learnings.js +26 -50
- package/dist/content/meta-skill.js +33 -22
- package/dist/content/next-command.js +41 -38
- package/dist/content/node-hooks.js +17 -345
- package/dist/content/opencode-plugin.js +5 -103
- package/dist/content/research-playbooks.js +14 -14
- package/dist/content/review-loop.d.ts +2 -0
- package/dist/content/review-loop.js +8 -0
- package/dist/content/session-hooks.js +15 -47
- package/dist/content/skills.d.ts +0 -5
- package/dist/content/skills.js +55 -128
- package/dist/content/stage-common-guidance.d.ts +0 -1
- package/dist/content/stage-common-guidance.js +17 -14
- package/dist/content/stage-schema.d.ts +26 -1
- package/dist/content/stage-schema.js +121 -40
- package/dist/content/stages/_lint-metadata/index.js +9 -15
- package/dist/content/stages/brainstorm.js +22 -43
- package/dist/content/stages/design.js +37 -57
- package/dist/content/stages/plan.js +22 -13
- package/dist/content/stages/review.js +24 -27
- package/dist/content/stages/scope.js +34 -46
- package/dist/content/stages/ship.js +7 -4
- package/dist/content/stages/spec.js +20 -9
- package/dist/content/stages/tdd.js +64 -44
- package/dist/content/start-command.js +13 -12
- package/dist/content/status-command.d.ts +2 -7
- package/dist/content/status-command.js +19 -146
- package/dist/content/subagents.d.ts +0 -5
- package/dist/content/subagents.js +51 -28
- package/dist/content/templates.d.ts +1 -1
- package/dist/content/templates.js +126 -135
- package/dist/content/track-render-context.d.ts +17 -0
- package/dist/content/track-render-context.js +44 -0
- package/dist/content/tree-command.d.ts +1 -2
- package/dist/content/tree-command.js +4 -87
- package/dist/content/utility-skills.d.ts +2 -29
- package/dist/content/utility-skills.js +2 -1534
- package/dist/content/view-command.js +31 -11
- package/dist/delegation.d.ts +1 -1
- package/dist/delegation.js +5 -15
- package/dist/doctor-registry.js +20 -21
- package/dist/doctor.js +88 -344
- package/dist/flow-state.d.ts +3 -0
- package/dist/flow-state.js +2 -0
- package/dist/harness-adapters.d.ts +1 -1
- package/dist/harness-adapters.js +51 -58
- package/dist/install.js +128 -358
- package/dist/internal/advance-stage.js +3 -9
- package/dist/internal/compound-readiness.d.ts +1 -1
- package/dist/internal/compound-readiness.js +1 -1
- package/dist/internal/tdd-loop-status.d.ts +1 -1
- package/dist/internal/tdd-loop-status.js +1 -1
- package/dist/knowledge-store.d.ts +16 -10
- package/dist/knowledge-store.js +51 -15
- package/dist/policy.js +16 -105
- package/dist/run-archive.d.ts +4 -6
- package/dist/run-archive.js +15 -20
- package/dist/run-persistence.d.ts +2 -2
- package/dist/run-persistence.js +3 -9
- package/package.json +1 -2
- package/dist/content/archive-command.d.ts +0 -2
- package/dist/content/archive-command.js +0 -124
- package/dist/content/compound-command.d.ts +0 -5
- package/dist/content/compound-command.js +0 -193
- package/dist/content/contexts.d.ts +0 -18
- package/dist/content/contexts.js +0 -24
- package/dist/content/contracts.d.ts +0 -2
- package/dist/content/contracts.js +0 -51
- package/dist/content/doctor-references.d.ts +0 -2
- package/dist/content/doctor-references.js +0 -150
- package/dist/content/eval-scaffold.d.ts +0 -15
- package/dist/content/eval-scaffold.js +0 -370
- package/dist/content/feature-command.d.ts +0 -2
- package/dist/content/feature-command.js +0 -123
- package/dist/content/flow-map.d.ts +0 -23
- package/dist/content/flow-map.js +0 -134
- package/dist/content/harness-doc.d.ts +0 -2
- package/dist/content/harness-doc.js +0 -202
- package/dist/content/harness-playbooks.d.ts +0 -24
- package/dist/content/harness-playbooks.js +0 -393
- package/dist/content/harness-tool-refs.d.ts +0 -20
- package/dist/content/harness-tool-refs.js +0 -268
- package/dist/content/ops-command.d.ts +0 -2
- package/dist/content/ops-command.js +0 -71
- package/dist/content/protocols.d.ts +0 -7
- package/dist/content/protocols.js +0 -215
- package/dist/content/retro-command.d.ts +0 -2
- package/dist/content/retro-command.js +0 -165
- package/dist/content/rewind-command.d.ts +0 -2
- package/dist/content/rewind-command.js +0 -106
- package/dist/content/tdd-log-command.d.ts +0 -2
- package/dist/content/tdd-log-command.js +0 -85
- package/dist/eval/agents/single-shot.d.ts +0 -27
- package/dist/eval/agents/single-shot.js +0 -79
- package/dist/eval/agents/with-tools.d.ts +0 -44
- package/dist/eval/agents/with-tools.js +0 -261
- package/dist/eval/agents/workflow.d.ts +0 -31
- package/dist/eval/agents/workflow.js +0 -155
- package/dist/eval/baseline.d.ts +0 -38
- package/dist/eval/baseline.js +0 -282
- package/dist/eval/config-loader.d.ts +0 -14
- package/dist/eval/config-loader.js +0 -395
- package/dist/eval/corpus.d.ts +0 -30
- package/dist/eval/corpus.js +0 -330
- package/dist/eval/cost-guard.d.ts +0 -102
- package/dist/eval/cost-guard.js +0 -190
- package/dist/eval/diff.d.ts +0 -64
- package/dist/eval/diff.js +0 -323
- package/dist/eval/llm-client.d.ts +0 -176
- package/dist/eval/llm-client.js +0 -267
- package/dist/eval/mode.d.ts +0 -28
- package/dist/eval/mode.js +0 -61
- package/dist/eval/progress.d.ts +0 -83
- package/dist/eval/progress.js +0 -59
- package/dist/eval/report.d.ts +0 -11
- package/dist/eval/report.js +0 -181
- package/dist/eval/rubric-loader.d.ts +0 -20
- package/dist/eval/rubric-loader.js +0 -143
- package/dist/eval/runner.d.ts +0 -81
- package/dist/eval/runner.js +0 -746
- package/dist/eval/runs.d.ts +0 -41
- package/dist/eval/runs.js +0 -114
- package/dist/eval/sandbox.d.ts +0 -38
- package/dist/eval/sandbox.js +0 -137
- package/dist/eval/tools/glob.d.ts +0 -2
- package/dist/eval/tools/glob.js +0 -163
- package/dist/eval/tools/grep.d.ts +0 -2
- package/dist/eval/tools/grep.js +0 -152
- package/dist/eval/tools/index.d.ts +0 -7
- package/dist/eval/tools/index.js +0 -35
- package/dist/eval/tools/read.d.ts +0 -2
- package/dist/eval/tools/read.js +0 -122
- package/dist/eval/tools/types.d.ts +0 -49
- package/dist/eval/tools/types.js +0 -41
- package/dist/eval/tools/write.d.ts +0 -2
- package/dist/eval/tools/write.js +0 -92
- package/dist/eval/types.d.ts +0 -561
- package/dist/eval/types.js +0 -47
- package/dist/eval/verifiers/judge.d.ts +0 -40
- package/dist/eval/verifiers/judge.js +0 -256
- package/dist/eval/verifiers/rules.d.ts +0 -24
- package/dist/eval/verifiers/rules.js +0 -218
- package/dist/eval/verifiers/structural.d.ts +0 -14
- package/dist/eval/verifiers/structural.js +0 -171
- package/dist/eval/verifiers/traceability.d.ts +0 -23
- package/dist/eval/verifiers/traceability.js +0 -84
- package/dist/eval/verifiers/workflow-consistency.d.ts +0 -21
- package/dist/eval/verifiers/workflow-consistency.js +0 -225
- package/dist/eval/workflow-corpus.d.ts +0 -7
- package/dist/eval/workflow-corpus.js +0 -207
- package/dist/feature-system.d.ts +0 -42
- package/dist/feature-system.js +0 -432
- package/dist/internal/knowledge-digest.d.ts +0 -7
- package/dist/internal/knowledge-digest.js +0 -93
package/dist/eval/types.d.ts
DELETED
|
@@ -1,561 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Core types for the cclaw eval subsystem (Phase 7).
|
|
3
|
-
*
|
|
4
|
-
* The eval subsystem lets us measure whether a change to a prompt, skill, or
|
|
5
|
-
* stage contract improves or regresses the quality of agent output. It is
|
|
6
|
-
* deliberately decoupled from the main cclaw runtime so that:
|
|
7
|
-
*
|
|
8
|
-
* - Users who never run `cclaw eval` pay zero runtime cost.
|
|
9
|
-
* - The verifier / rubric / LLM stack evolves on its own release cadence (Steps 0-6).
|
|
10
|
-
* - Any OpenAI-compatible endpoint can be swapped in via config (z.ai, OpenAI, vLLM, etc.).
|
|
11
|
-
*/
|
|
12
|
-
import type { FlowStage } from "../types.js";
|
|
13
|
-
/**
|
|
14
|
-
* Evaluation mode — what the agent-under-test actually does.
|
|
15
|
-
*
|
|
16
|
-
* - `fixture` — verify an existing artifact against structural/rule/judge
|
|
17
|
-
* expectations. No LLM drafting, only verifiers (judge may still invoke
|
|
18
|
-
* the API). Cheapest mode.
|
|
19
|
-
* - `agent` — LLM drafts a single-stage artifact inside a sandbox using the
|
|
20
|
-
* function-calling loop (read_file/write_file/glob/grep). Replaces the
|
|
21
|
-
* previous single-shot path entirely.
|
|
22
|
-
* - `workflow` — LLM orchestrates the full multi-stage flow
|
|
23
|
-
* (brainstorm → scope → design → spec → plan) with threaded artifacts.
|
|
24
|
-
*
|
|
25
|
-
* Legacy `A|B|C` tier names are still accepted by the CLI/config loader with
|
|
26
|
-
* a deprecation warning — see `src/eval/mode.ts` for the mapping.
|
|
27
|
-
*/
|
|
28
|
-
export declare const EVAL_MODES: readonly ["fixture", "agent", "workflow"];
|
|
29
|
-
export type EvalMode = (typeof EVAL_MODES)[number];
|
|
30
|
-
/**
|
|
31
|
-
* Legacy tier identifier, kept so on-disk reports generated before v0.28.0
|
|
32
|
-
* keep parsing. New code should always use `EvalMode`.
|
|
33
|
-
* @deprecated use `EvalMode` + `toMode()` from `src/eval/mode.ts`.
|
|
34
|
-
*/
|
|
35
|
-
export declare const EVAL_TIERS: readonly ["A", "B", "C"];
|
|
36
|
-
/** @deprecated use `EvalMode`. */
|
|
37
|
-
export type EvalTier = (typeof EVAL_TIERS)[number];
|
|
38
|
-
/**
|
|
39
|
-
* Verifier kinds, in increasing cost and decreasing determinism:
|
|
40
|
-
* structural and rules run without LLM; judge and workflow use the configured model.
|
|
41
|
-
* `consistency` is the workflow-mode cross-artifact family (deterministic but
|
|
42
|
-
* operates over multiple artifacts at once).
|
|
43
|
-
*/
|
|
44
|
-
export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "workflow", "consistency"];
|
|
45
|
-
export type VerifierKind = (typeof VERIFIER_KINDS)[number];
|
|
46
|
-
/**
|
|
47
|
-
* Structural expectations — deterministic, LLM-free checks against a single
|
|
48
|
-
* text artifact. Step 1 implements all fields below; Step 2 adds the
|
|
49
|
-
* sibling `rules` shape, Step 3 adds `judge`.
|
|
50
|
-
*/
|
|
51
|
-
export interface StructuralExpected {
|
|
52
|
-
/**
|
|
53
|
-
* Case-insensitive substrings that must each appear on at least one markdown
|
|
54
|
-
* heading line (line starting with `#`). Useful for "required sections".
|
|
55
|
-
*/
|
|
56
|
-
requiredSections?: string[];
|
|
57
|
-
/**
|
|
58
|
-
* Case-insensitive substrings that must NOT appear anywhere in the body
|
|
59
|
-
* (headings or prose). Typical entries: "TBD", "TODO", "placeholder".
|
|
60
|
-
*/
|
|
61
|
-
forbiddenPatterns?: string[];
|
|
62
|
-
/** Inclusive minimum line count of the artifact body (frontmatter excluded). */
|
|
63
|
-
minLines?: number;
|
|
64
|
-
/** Inclusive maximum line count of the artifact body (frontmatter excluded). */
|
|
65
|
-
maxLines?: number;
|
|
66
|
-
/** Inclusive minimum character count of the artifact body. */
|
|
67
|
-
minChars?: number;
|
|
68
|
-
/** Inclusive maximum character count of the artifact body. */
|
|
69
|
-
maxChars?: number;
|
|
70
|
-
/**
|
|
71
|
-
* Keys that must appear in the leading YAML frontmatter (between a pair of
|
|
72
|
-
* `---` delimiters at the very top of the file). An artifact without
|
|
73
|
-
* frontmatter will fail every entry.
|
|
74
|
-
*/
|
|
75
|
-
requiredFrontmatterKeys?: string[];
|
|
76
|
-
}
|
|
77
|
-
/**
|
|
78
|
-
* Rule-based expectations — zero-LLM content checks that are richer than
|
|
79
|
-
* structural (regex, numeric bounds, uniqueness). Introduced in Step 2.
|
|
80
|
-
*
|
|
81
|
-
* Every array field is optional; an empty `RulesExpected` produces zero
|
|
82
|
-
* verifier results so authors can enable rules incrementally.
|
|
83
|
-
*/
|
|
84
|
-
export interface RulesExpected {
|
|
85
|
-
/** Case-insensitive substrings the body must include at least once. */
|
|
86
|
-
mustContain?: string[];
|
|
87
|
-
/** Case-insensitive substrings the body must NOT include. */
|
|
88
|
-
mustNotContain?: string[];
|
|
89
|
-
/** Regex patterns that must match the body at least once. */
|
|
90
|
-
regexRequired?: RuleRegex[];
|
|
91
|
-
/** Regex patterns that must NOT match the body. */
|
|
92
|
-
regexForbidden?: RuleRegex[];
|
|
93
|
-
/** For each substring key, the body must contain at least N occurrences. */
|
|
94
|
-
minOccurrences?: Record<string, number>;
|
|
95
|
-
/** For each substring key, the body must contain at most N occurrences. */
|
|
96
|
-
maxOccurrences?: Record<string, number>;
|
|
97
|
-
/**
|
|
98
|
-
* For each named section (case-insensitive heading substring), every bullet
|
|
99
|
-
* (`- ...`) directly under the section must be unique. Catches duplicated
|
|
100
|
-
* decisions or repeated risks.
|
|
101
|
-
*/
|
|
102
|
-
uniqueBulletsInSection?: string[];
|
|
103
|
-
}
|
|
104
|
-
export interface RuleRegex {
|
|
105
|
-
/** Source of the regex. Parsed with `new RegExp(pattern, flags)`. */
|
|
106
|
-
pattern: string;
|
|
107
|
-
/** Optional regex flags; defaults to `"i"` for case-insensitive matching. */
|
|
108
|
-
flags?: string;
|
|
109
|
-
/** Human-readable label rendered in verifier messages and slugged into the id. */
|
|
110
|
-
description?: string;
|
|
111
|
-
}
|
|
112
|
-
/**
|
|
113
|
-
* Cross-stage traceability expectations — assert every ID extracted from
|
|
114
|
-
* `source` also appears in `self` and/or named `extra_fixtures`. Introduced
|
|
115
|
-
* in Step 2.
|
|
116
|
-
*/
|
|
117
|
-
export interface TraceabilityExpected {
|
|
118
|
-
/** Regex applied to the `source` fixture to collect the authoritative ID set. */
|
|
119
|
-
idPattern: string;
|
|
120
|
-
/** Optional regex flags (defaults to `"g"`). */
|
|
121
|
-
idFlags?: string;
|
|
122
|
-
/**
|
|
123
|
-
* Where to read the authoritative ID set from. Either `"self"` (the case's
|
|
124
|
-
* primary `fixture`) or a label present in the case's `extraFixtures` map.
|
|
125
|
-
*/
|
|
126
|
-
source: string;
|
|
127
|
-
/**
|
|
128
|
-
* Where every source ID must also appear. Each entry is `"self"` or an
|
|
129
|
-
* `extraFixtures` label. Order is preserved for deterministic result ids.
|
|
130
|
-
*/
|
|
131
|
-
requireIn: string[];
|
|
132
|
-
}
|
|
133
|
-
/**
|
|
134
|
-
* LLM-judge expectations — Step 3.
|
|
135
|
-
*
|
|
136
|
-
* When present, the judge runs against the resolved artifact (live-agent
|
|
137
|
-
* output in agent/workflow mode, or the pre-generated fixture when `--judge` is
|
|
138
|
-
* combined with `--schema-only` for smoke tests). Every field below is
|
|
139
|
-
* optional; the case-level hint overlays the stage-level rubric loaded
|
|
140
|
-
* from `.cclaw/evals/rubrics/<stage>.yaml`.
|
|
141
|
-
*/
|
|
142
|
-
export interface JudgeExpected {
|
|
143
|
-
/**
|
|
144
|
-
* Per-case check ids that MUST be present in the stage rubric. Used when
|
|
145
|
-
* a case wants to assert the rubric covers scenario-specific properties.
|
|
146
|
-
*/
|
|
147
|
-
requiredChecks?: string[];
|
|
148
|
-
/**
|
|
149
|
-
* Stage rubric identifier when a stage ships multiple rubrics (e.g.
|
|
150
|
-
* "strict" vs. "lenient"). Defaults to the stage name.
|
|
151
|
-
*/
|
|
152
|
-
rubric?: string;
|
|
153
|
-
/** Optional override of `config.judgeSamples` for the case. */
|
|
154
|
-
samples?: number;
|
|
155
|
-
/** Per-check minimum score (1..5 scale). Fail when any score drops below. */
|
|
156
|
-
minimumScores?: Record<string, number>;
|
|
157
|
-
}
|
|
158
|
-
/** Superset of per-verifier expectation shapes. */
|
|
159
|
-
export interface ExpectedShape {
|
|
160
|
-
structural?: StructuralExpected;
|
|
161
|
-
/** Rule-based (keyword/regex/count/uniqueness) checks — Step 2. */
|
|
162
|
-
rules?: RulesExpected;
|
|
163
|
-
/** Cross-stage ID propagation checks — Step 2. */
|
|
164
|
-
traceability?: TraceabilityExpected;
|
|
165
|
-
/** LLM-judge rubrics — Step 3. */
|
|
166
|
-
judge?: JudgeExpected;
|
|
167
|
-
}
|
|
168
|
-
/**
|
|
169
|
-
* A single eval case describes one input scenario for one stage. Cases live in
|
|
170
|
-
* `.cclaw/evals/corpus/<stage>/<id>.yaml` and may reference a pre-generated
|
|
171
|
-
* fixture artifact for verifier development (Step 1) before the agent loop
|
|
172
|
-
* exists (Step 3+).
|
|
173
|
-
*/
|
|
174
|
-
export interface EvalCase {
|
|
175
|
-
id: string;
|
|
176
|
-
stage: FlowStage;
|
|
177
|
-
inputPrompt: string;
|
|
178
|
-
/** Project files copied into the agent/workflow sandbox before the agent runs. */
|
|
179
|
-
contextFiles?: string[];
|
|
180
|
-
/**
|
|
181
|
-
* Typed expectation hints consumed by the structural/rules/judge verifiers.
|
|
182
|
-
* Each sub-shape is optional; missing sub-shapes skip that verifier tier.
|
|
183
|
-
*/
|
|
184
|
-
expected?: ExpectedShape;
|
|
185
|
-
/**
|
|
186
|
-
* Path (relative to the corpus case file) of a pre-generated artifact used
|
|
187
|
-
* when verifiers are exercised without a live agent loop. Primarily a
|
|
188
|
-
* Step 1 development aid.
|
|
189
|
-
*/
|
|
190
|
-
fixture?: string;
|
|
191
|
-
/**
|
|
192
|
-
* Additional fixture paths loaded alongside the primary `fixture`, keyed
|
|
193
|
-
* by a free-form label. Consumed by cross-artifact verifiers (e.g.,
|
|
194
|
-
* traceability) introduced in Step 2. Paths are resolved relative to the
|
|
195
|
-
* case's stage directory, just like `fixture`.
|
|
196
|
-
*/
|
|
197
|
-
extraFixtures?: Record<string, string>;
|
|
198
|
-
}
|
|
199
|
-
/** Result of one verifier applied to one case. */
|
|
200
|
-
export interface VerifierResult {
|
|
201
|
-
kind: VerifierKind;
|
|
202
|
-
id: string;
|
|
203
|
-
ok: boolean;
|
|
204
|
-
/** Normalized 0..1 score when the verifier produces a numeric signal. */
|
|
205
|
-
score?: number;
|
|
206
|
-
message?: string;
|
|
207
|
-
details?: Record<string, unknown>;
|
|
208
|
-
}
|
|
209
|
-
/** Aggregate result for one case after all verifiers run. */
|
|
210
|
-
export interface EvalCaseResult {
|
|
211
|
-
caseId: string;
|
|
212
|
-
stage: FlowStage;
|
|
213
|
-
mode: EvalMode;
|
|
214
|
-
passed: boolean;
|
|
215
|
-
durationMs: number;
|
|
216
|
-
costUsd?: number;
|
|
217
|
-
verifierResults: VerifierResult[];
|
|
218
|
-
/**
|
|
219
|
-
* Only populated in `workflow` mode: per-stage breakdown collected by
|
|
220
|
-
* the workflow orchestrator. Unset for `fixture` / `agent` modes so the
|
|
221
|
-
* on-disk JSON stays small.
|
|
222
|
-
*/
|
|
223
|
-
workflow?: WorkflowRunSummary;
|
|
224
|
-
}
|
|
225
|
-
/** Top-level eval report, serialized to JSON and rendered to Markdown. */
|
|
226
|
-
export interface EvalReport {
|
|
227
|
-
schemaVersion: 1;
|
|
228
|
-
generatedAt: string;
|
|
229
|
-
runId: string;
|
|
230
|
-
cclawVersion: string;
|
|
231
|
-
provider: string;
|
|
232
|
-
model: string;
|
|
233
|
-
mode: EvalMode;
|
|
234
|
-
stages: FlowStage[];
|
|
235
|
-
cases: EvalCaseResult[];
|
|
236
|
-
summary: {
|
|
237
|
-
totalCases: number;
|
|
238
|
-
passed: number;
|
|
239
|
-
failed: number;
|
|
240
|
-
skipped: number;
|
|
241
|
-
totalCostUsd: number;
|
|
242
|
-
totalDurationMs: number;
|
|
243
|
-
};
|
|
244
|
-
/** Present when comparing against a saved baseline (Step 1+). */
|
|
245
|
-
baselineDelta?: BaselineDelta;
|
|
246
|
-
}
|
|
247
|
-
/**
|
|
248
|
-
* Eval configuration, persisted to `.cclaw/evals/config.yaml` and mergeable
|
|
249
|
-
* with `CCLAW_EVAL_*` environment variables at runtime.
|
|
250
|
-
*/
|
|
251
|
-
export interface EvalConfig {
|
|
252
|
-
/**
|
|
253
|
-
* Free-form provider name used in reports. The actual HTTP protocol is
|
|
254
|
-
* determined by `baseUrl`, which is expected to be OpenAI-compatible.
|
|
255
|
-
*/
|
|
256
|
-
provider: string;
|
|
257
|
-
/** OpenAI-compatible base URL, e.g. `https://api.z.ai/api/coding/paas/v4`. */
|
|
258
|
-
baseUrl: string;
|
|
259
|
-
/** Model identifier for both agent-under-test and judge unless `judgeModel` overrides. */
|
|
260
|
-
model: string;
|
|
261
|
-
/** Optional separate model for the judge role. Defaults to `model`. */
|
|
262
|
-
judgeModel?: string;
|
|
263
|
-
/** Default mode when `--mode` is not supplied. */
|
|
264
|
-
defaultMode: EvalMode;
|
|
265
|
-
/** Optional hard stop on estimated USD spend per day. Unset = no cap. */
|
|
266
|
-
dailyUsdCap?: number;
|
|
267
|
-
/** Regression thresholds for CI gates. */
|
|
268
|
-
regression: {
|
|
269
|
-
/** Fail when overall score drops by more than this fraction (e.g. 0.15 = 15%). */
|
|
270
|
-
failIfDeltaBelow: number;
|
|
271
|
-
/** Fail when any single critical rubric drops below this absolute score. */
|
|
272
|
-
failIfCriticalBelow: number;
|
|
273
|
-
};
|
|
274
|
-
/** Per-agent-run timeout in milliseconds. */
|
|
275
|
-
timeoutMs: number;
|
|
276
|
-
/** Max retries per API call on transient failures. */
|
|
277
|
-
maxRetries: number;
|
|
278
|
-
/**
|
|
279
|
-
* Number of judge samples per case (median-of-N). Defaults to 3 when unset.
|
|
280
|
-
* Must be odd so a true median exists.
|
|
281
|
-
*/
|
|
282
|
-
judgeSamples?: number;
|
|
283
|
-
/** Sampling temperature for judge calls. Defaults to 0.0. */
|
|
284
|
-
judgeTemperature?: number;
|
|
285
|
-
/** Sampling temperature for the agent-under-test. Defaults to 0.2. */
|
|
286
|
-
agentTemperature?: number;
|
|
287
|
-
/**
|
|
288
|
-
* Optional per-model USD pricing used by the cost guard. Keys match
|
|
289
|
-
* `model` / `judgeModel`. Values in USD per 1K tokens, so
|
|
290
|
-
* `{ input: 0.0005, output: 0.0015 }` = $0.50 per 1M input tokens.
|
|
291
|
-
*/
|
|
292
|
-
tokenPricing?: Record<string, TokenPricing>;
|
|
293
|
-
/**
|
|
294
|
-
* Maximum assistant turns (tool_calls → tool result cycles) allowed by
|
|
295
|
-
* the with-tools agent loop (agent/workflow mode). Defaults to 8. Runs that
|
|
296
|
-
* exceed the cap fail with a `MaxTurnsExceededError` and surface as a
|
|
297
|
-
* workflow verifier result.
|
|
298
|
-
*/
|
|
299
|
-
toolMaxTurns?: number;
|
|
300
|
-
/**
|
|
301
|
-
* Per-invocation ceiling on tool call arguments bytes. Defends against
|
|
302
|
-
* runaway writes. Defaults to 64 KiB.
|
|
303
|
-
*/
|
|
304
|
-
toolMaxArgumentsBytes?: number;
|
|
305
|
-
/**
|
|
306
|
-
* Per-invocation ceiling on tool call result bytes returned to the
|
|
307
|
-
* model. Defaults to 32 KiB; longer results are truncated with a
|
|
308
|
-
* marker so the model sees the cutoff.
|
|
309
|
-
*/
|
|
310
|
-
toolMaxResultBytes?: number;
|
|
311
|
-
/**
|
|
312
|
-
* Maximum total turns a single workflow-mode case may consume
|
|
313
|
-
* across all stages combined. Defaults to 40 (stages × toolMaxTurns).
|
|
314
|
-
* Runs that exceed the cap fail the current stage with a
|
|
315
|
-
* `MaxTurnsExceededError` propagated from the underlying with-tools
|
|
316
|
-
* loop rather than a dedicated workflow-level error.
|
|
317
|
-
*/
|
|
318
|
-
workflowMaxTotalTurns?: number;
|
|
319
|
-
}
|
|
320
|
-
/** Per-model pricing schedule, expressed as USD per 1K tokens. */
|
|
321
|
-
export interface TokenPricing {
|
|
322
|
-
input: number;
|
|
323
|
-
output: number;
|
|
324
|
-
}
|
|
325
|
-
/** Resolved config with env overrides applied. */
|
|
326
|
-
export interface ResolvedEvalConfig extends EvalConfig {
|
|
327
|
-
apiKey?: string;
|
|
328
|
-
source: "default" | "file" | "env" | "file+env";
|
|
329
|
-
}
|
|
330
|
-
/**
|
|
331
|
-
* Frozen per-stage baseline used by regression gating (Step 1). Baselines
|
|
332
|
-
* are committed to git; `cclaw eval --update-baseline --confirm` rewrites
|
|
333
|
-
* them. The shape is intentionally flat so a quick `git diff` reveals what
|
|
334
|
-
* changed between runs.
|
|
335
|
-
*/
|
|
336
|
-
export interface BaselineSnapshot {
|
|
337
|
-
schemaVersion: 1;
|
|
338
|
-
stage: FlowStage;
|
|
339
|
-
generatedAt: string;
|
|
340
|
-
cclawVersion: string;
|
|
341
|
-
/** Keyed by `EvalCase.id` so unchanged cases produce zero diff. */
|
|
342
|
-
cases: Record<string, BaselineCaseEntry>;
|
|
343
|
-
/**
|
|
344
|
-
* Tamper-evident signature computed as sha256 over the canonical JSON
|
|
345
|
-
* encoding of `{ schemaVersion, stage, cases }`. Present on files
|
|
346
|
-
* written by cclaw >= 0.28.0; older baselines load with `signature`
|
|
347
|
-
* absent and the loader skips verification.
|
|
348
|
-
*/
|
|
349
|
-
signature?: {
|
|
350
|
-
algorithm: "sha256";
|
|
351
|
-
digest: string;
|
|
352
|
-
/** ISO timestamp of when the digest was computed. */
|
|
353
|
-
signedAt: string;
|
|
354
|
-
};
|
|
355
|
-
}
|
|
356
|
-
export interface BaselineCaseEntry {
|
|
357
|
-
passed: boolean;
|
|
358
|
-
verifierResults: BaselineVerifierEntry[];
|
|
359
|
-
}
|
|
360
|
-
export interface BaselineVerifierEntry {
|
|
361
|
-
id: string;
|
|
362
|
-
kind: VerifierKind;
|
|
363
|
-
ok: boolean;
|
|
364
|
-
score?: number;
|
|
365
|
-
}
|
|
366
|
-
/**
|
|
367
|
-
* Delta between a fresh report and the saved baseline. Populated when
|
|
368
|
-
* baselines exist on disk and the run covers matching cases.
|
|
369
|
-
*/
|
|
370
|
-
export interface BaselineDelta {
|
|
371
|
-
baselineId: string;
|
|
372
|
-
/** Fresh-score − baseline-score, bounded to [-1, 1]. */
|
|
373
|
-
scoreDelta: number;
|
|
374
|
-
/** Count of checks that flipped from `ok:true` to `ok:false`. */
|
|
375
|
-
criticalFailures: number;
|
|
376
|
-
/** Per-case regression details for the Markdown report. */
|
|
377
|
-
regressions: BaselineRegression[];
|
|
378
|
-
}
|
|
379
|
-
export interface BaselineRegression {
|
|
380
|
-
caseId: string;
|
|
381
|
-
stage: FlowStage;
|
|
382
|
-
verifierId: string;
|
|
383
|
-
reason: "newly-failing" | "case-now-failing" | "score-drop";
|
|
384
|
-
previousScore?: number;
|
|
385
|
-
currentScore?: number;
|
|
386
|
-
}
|
|
387
|
-
/**
|
|
388
|
-
* One rubric check evaluated by the LLM judge. Scored on a 1..5 scale;
|
|
389
|
-
* 5 means "the artifact fully meets the bar described by `prompt`".
|
|
390
|
-
*/
|
|
391
|
-
export interface RubricCheck {
|
|
392
|
-
/** Kebab-case slug, unique per rubric. Stable across runs. */
|
|
393
|
-
id: string;
|
|
394
|
-
/** Natural-language question posed to the judge. */
|
|
395
|
-
prompt: string;
|
|
396
|
-
/** Human-readable scale description rendered in judge prompts. */
|
|
397
|
-
scale?: string;
|
|
398
|
-
/** Relative weight for the stage's aggregate score. Defaults to 1.0. */
|
|
399
|
-
weight?: number;
|
|
400
|
-
/**
|
|
401
|
-
* When true, any sample below `config.regression.failIfCriticalBelow`
|
|
402
|
-
* flips the verifier to `ok:false` (not just a score drop).
|
|
403
|
-
*/
|
|
404
|
-
critical?: boolean;
|
|
405
|
-
}
|
|
406
|
-
/** Parsed `.cclaw/evals/rubrics/<stage>.yaml`. */
|
|
407
|
-
export interface RubricDoc {
|
|
408
|
-
stage: FlowStage;
|
|
409
|
-
/** Optional rubric variant label; defaults to the stage name. */
|
|
410
|
-
id: string;
|
|
411
|
-
checks: RubricCheck[];
|
|
412
|
-
}
|
|
413
|
-
/**
|
|
414
|
-
* Judge response for a single sample (one API call). The judge is asked to
|
|
415
|
-
* return structured JSON; `scores[id]` maps rubric check id → integer 1..5.
|
|
416
|
-
* `rationales[id]` is a short plain-text explanation, useful in reports but
|
|
417
|
-
* never used for gating.
|
|
418
|
-
*/
|
|
419
|
-
export interface JudgeSample {
|
|
420
|
-
scores: Record<string, number>;
|
|
421
|
-
rationales: Record<string, string>;
|
|
422
|
-
}
|
|
423
|
-
/** Aggregated judge output across N samples, per rubric check. */
|
|
424
|
-
export interface JudgeAggregate {
|
|
425
|
-
checkId: string;
|
|
426
|
-
samples: number[];
|
|
427
|
-
median: number;
|
|
428
|
-
mean: number;
|
|
429
|
-
/** True iff every sample returned a score for this check. */
|
|
430
|
-
coverage: boolean;
|
|
431
|
-
}
|
|
432
|
-
/**
|
|
433
|
-
* Judge invocation result. Produced by `runJudge` and consumed by the
|
|
434
|
-
* runner: the runner converts each aggregate into a `VerifierResult` and
|
|
435
|
-
* records `usageUsd` toward the per-case cost.
|
|
436
|
-
*/
|
|
437
|
-
export interface JudgeInvocation {
|
|
438
|
-
rubricId: string;
|
|
439
|
-
samples: JudgeSample[];
|
|
440
|
-
aggregates: JudgeAggregate[];
|
|
441
|
-
usageUsd: number;
|
|
442
|
-
durationMs: number;
|
|
443
|
-
}
|
|
444
|
-
/**
|
|
445
|
-
* Tool-use summary produced by the with-tools agent loop. Captured so
|
|
446
|
-
* the runner can surface per-case tool metrics in the markdown report
|
|
447
|
-
* (number of calls, depth, error rate, denied paths).
|
|
448
|
-
*/
|
|
449
|
-
export interface ToolUseSummary {
|
|
450
|
-
/** Turns consumed before the agent produced a terminal assistant message. */
|
|
451
|
-
turns: number;
|
|
452
|
-
/** Total successful tool invocations across all turns. */
|
|
453
|
-
calls: number;
|
|
454
|
-
/** Tool invocations that returned an error (bad args, denied path, etc.). */
|
|
455
|
-
errors: number;
|
|
456
|
-
/** Paths the sandbox refused to resolve (escape attempts, missing files). */
|
|
457
|
-
deniedPaths: string[];
|
|
458
|
-
/** Per-tool call counts, keyed by tool name. */
|
|
459
|
-
byTool: Record<string, number>;
|
|
460
|
-
}
|
|
461
|
-
/**
|
|
462
|
-
* Cross-stage consistency expectations for a workflow-mode case. Every
|
|
463
|
-
* sub-check is optional so authors can opt in incrementally; an empty
|
|
464
|
-
* block produces zero verifier results.
|
|
465
|
-
*/
|
|
466
|
-
export interface WorkflowConsistencyExpected {
|
|
467
|
-
/**
|
|
468
|
-
* For each rule, every id extracted from the `from` stage must appear in
|
|
469
|
-
* every listed `to` stage. Typical entry: `{ idPattern: "D-\\d+", from:
|
|
470
|
-
* "scope", to: ["plan"] }`. Guards the "decisions flow downstream" rule.
|
|
471
|
-
*/
|
|
472
|
-
idsFlow?: Array<{
|
|
473
|
-
idPattern: string;
|
|
474
|
-
idFlags?: string;
|
|
475
|
-
from: WorkflowStageName;
|
|
476
|
-
to: WorkflowStageName[];
|
|
477
|
-
}>;
|
|
478
|
-
/**
|
|
479
|
-
* Stages that must not contain any of the listed case-insensitive
|
|
480
|
-
* phrases. Defaults to `["TBD", "TODO", "placeholder"]` when set to an
|
|
481
|
-
* empty array; omit entirely to skip the check.
|
|
482
|
-
*/
|
|
483
|
-
placeholderFree?: {
|
|
484
|
-
stages: WorkflowStageName[];
|
|
485
|
-
phrases?: string[];
|
|
486
|
-
};
|
|
487
|
-
/**
|
|
488
|
-
* Free-form substring pairs: for every entry, if `must` appears in the
|
|
489
|
-
* named stage, `forbid` must NOT appear anywhere in the listed
|
|
490
|
-
* `stages`. Useful for "v1 decided in scope, plan must not say v2".
|
|
491
|
-
*/
|
|
492
|
-
noContradictions?: Array<{
|
|
493
|
-
stage: WorkflowStageName;
|
|
494
|
-
must: string;
|
|
495
|
-
forbid: string;
|
|
496
|
-
stages: WorkflowStageName[];
|
|
497
|
-
}>;
|
|
498
|
-
}
|
|
499
|
-
/**
|
|
500
|
-
* A single stage step inside a workflow-mode case. The stage's
|
|
501
|
-
* `inputPrompt` is handed to the with-tools agent loop with prior-stage
|
|
502
|
-
* artifacts seeded into the sandbox under `stages/<name>.md`.
|
|
503
|
-
*/
|
|
504
|
-
export interface WorkflowStageStep {
|
|
505
|
-
name: WorkflowStageName;
|
|
506
|
-
inputPrompt: string;
|
|
507
|
-
/** Per-stage rubric id override (defaults to the stage name). */
|
|
508
|
-
rubric?: string;
|
|
509
|
-
/** Per-stage required rubric check ids (mirror of JudgeExpected.requiredChecks). */
|
|
510
|
-
requiredChecks?: string[];
|
|
511
|
-
/** Per-stage minimum rubric scores (mirror of JudgeExpected.minimumScores). */
|
|
512
|
-
minimumScores?: Record<string, number>;
|
|
513
|
-
}
|
|
514
|
-
/**
|
|
515
|
-
* Supported workflow-mode stages. Deliberately a subset of `FlowStage` —
|
|
516
|
-
* the workflow mode covers the early "design" arc of a project. TDD/review/ship
|
|
517
|
-
* are out of scope (they require real code execution).
|
|
518
|
-
*/
|
|
519
|
-
export declare const WORKFLOW_STAGES: readonly ["brainstorm", "scope", "design", "spec", "plan"];
|
|
520
|
-
export type WorkflowStageName = (typeof WORKFLOW_STAGES)[number];
|
|
521
|
-
/**
|
|
522
|
-
* A workflow-mode case. Lives under
|
|
523
|
-
* `.cclaw/evals/corpus/workflows/<id>.yaml` and wires a multi-stage run
|
|
524
|
-
* through the with-tools agent.
|
|
525
|
-
*/
|
|
526
|
-
export interface WorkflowCase {
|
|
527
|
-
id: string;
|
|
528
|
-
/** Short human-readable description (rendered in reports). */
|
|
529
|
-
description?: string;
|
|
530
|
-
/** Project files seeded into the sandbox before stage 1 runs. */
|
|
531
|
-
contextFiles?: string[];
|
|
532
|
-
/** Ordered list of stages to run. Must be non-empty. */
|
|
533
|
-
stages: WorkflowStageStep[];
|
|
534
|
-
/** Cross-stage consistency checks (workflow-mode verifier family). */
|
|
535
|
-
consistency?: WorkflowConsistencyExpected;
|
|
536
|
-
}
|
|
537
|
-
/** Per-stage record inside a workflow-mode run. */
|
|
538
|
-
export interface WorkflowStageResult {
|
|
539
|
-
stage: WorkflowStageName;
|
|
540
|
-
artifact: string;
|
|
541
|
-
durationMs: number;
|
|
542
|
-
usageUsd: number;
|
|
543
|
-
toolUse: ToolUseSummary;
|
|
544
|
-
attempts: number;
|
|
545
|
-
model: string;
|
|
546
|
-
promptTokens: number;
|
|
547
|
-
completionTokens: number;
|
|
548
|
-
/** True when the judge (when requested) produced `ok:true` for every required check. */
|
|
549
|
-
judgeOk?: boolean;
|
|
550
|
-
/** Per-rubric-check medians keyed by check id (for the report). */
|
|
551
|
-
judgeMedians?: Record<string, number>;
|
|
552
|
-
}
|
|
553
|
-
/** Workflow-mode orchestration output collected by the runner. */
|
|
554
|
-
export interface WorkflowRunSummary {
|
|
555
|
-
caseId: string;
|
|
556
|
-
stages: WorkflowStageResult[];
|
|
557
|
-
totalUsageUsd: number;
|
|
558
|
-
totalDurationMs: number;
|
|
559
|
-
/** True when every stage judge was ok (or judge was skipped everywhere). */
|
|
560
|
-
allJudgeOk: boolean;
|
|
561
|
-
}
|
package/dist/eval/types.js
DELETED
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Evaluation mode — what the agent-under-test actually does.
|
|
3
|
-
*
|
|
4
|
-
* - `fixture` — verify an existing artifact against structural/rule/judge
|
|
5
|
-
* expectations. No LLM drafting, only verifiers (judge may still invoke
|
|
6
|
-
* the API). Cheapest mode.
|
|
7
|
-
* - `agent` — LLM drafts a single-stage artifact inside a sandbox using the
|
|
8
|
-
* function-calling loop (read_file/write_file/glob/grep). Replaces the
|
|
9
|
-
* previous single-shot path entirely.
|
|
10
|
-
* - `workflow` — LLM orchestrates the full multi-stage flow
|
|
11
|
-
* (brainstorm → scope → design → spec → plan) with threaded artifacts.
|
|
12
|
-
*
|
|
13
|
-
* Legacy `A|B|C` tier names are still accepted by the CLI/config loader with
|
|
14
|
-
* a deprecation warning — see `src/eval/mode.ts` for the mapping.
|
|
15
|
-
*/
|
|
16
|
-
export const EVAL_MODES = ["fixture", "agent", "workflow"];
|
|
17
|
-
/**
|
|
18
|
-
* Legacy tier identifier, kept so on-disk reports generated before v0.28.0
|
|
19
|
-
* keep parsing. New code should always use `EvalMode`.
|
|
20
|
-
* @deprecated use `EvalMode` + `toMode()` from `src/eval/mode.ts`.
|
|
21
|
-
*/
|
|
22
|
-
export const EVAL_TIERS = ["A", "B", "C"];
|
|
23
|
-
/**
|
|
24
|
-
* Verifier kinds, in increasing cost and decreasing determinism:
|
|
25
|
-
* structural and rules run without LLM; judge and workflow use the configured model.
|
|
26
|
-
* `consistency` is the workflow-mode cross-artifact family (deterministic but
|
|
27
|
-
* operates over multiple artifacts at once).
|
|
28
|
-
*/
|
|
29
|
-
export const VERIFIER_KINDS = [
|
|
30
|
-
"structural",
|
|
31
|
-
"rules",
|
|
32
|
-
"judge",
|
|
33
|
-
"workflow",
|
|
34
|
-
"consistency"
|
|
35
|
-
];
|
|
36
|
-
/**
|
|
37
|
-
* Supported workflow-mode stages. Deliberately a subset of `FlowStage` —
|
|
38
|
-
* the workflow mode covers the early "design" arc of a project. TDD/review/ship
|
|
39
|
-
* are out of scope (they require real code execution).
|
|
40
|
-
*/
|
|
41
|
-
export const WORKFLOW_STAGES = [
|
|
42
|
-
"brainstorm",
|
|
43
|
-
"scope",
|
|
44
|
-
"design",
|
|
45
|
-
"spec",
|
|
46
|
-
"plan"
|
|
47
|
-
];
|
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* LLM judge verifier — Step 3.
|
|
3
|
-
*
|
|
4
|
-
* Given an artifact and the stage's rubric, runs N judge samples (default
|
|
5
|
-
* median-of-3) against the configured LLM, aggregates the per-check
|
|
6
|
-
* scores, and returns one VerifierResult per rubric check plus one
|
|
7
|
-
* aggregate result covering the whole stage.
|
|
8
|
-
*
|
|
9
|
-
* Deterministic pieces (JSON parsing, aggregation, scoring) are kept pure
|
|
10
|
-
* so unit tests inject a stub EvalLlmClient and assert on the aggregate
|
|
11
|
-
* math without touching the network.
|
|
12
|
-
*/
|
|
13
|
-
import { type EvalLlmClient } from "../llm-client.js";
|
|
14
|
-
import type { JudgeExpected, JudgeInvocation, JudgeSample, ResolvedEvalConfig, RubricDoc, VerifierResult } from "../types.js";
|
|
15
|
-
export interface RunJudgeOptions {
|
|
16
|
-
artifact: string;
|
|
17
|
-
rubric: RubricDoc;
|
|
18
|
-
config: Pick<ResolvedEvalConfig, "model" | "judgeModel" | "judgeSamples" | "judgeTemperature" | "timeoutMs" | "tokenPricing">;
|
|
19
|
-
client: EvalLlmClient;
|
|
20
|
-
/** Per-case hint that overlays the rubric (sample count, minimums). */
|
|
21
|
-
caseHint?: JudgeExpected;
|
|
22
|
-
/** Optional seed seed; incremented per sample for reproducibility. */
|
|
23
|
-
baseSeed?: number;
|
|
24
|
-
}
|
|
25
|
-
/**
|
|
26
|
-
* Parse one judge response into a JudgeSample. The parser is intentionally
|
|
27
|
-
* forgiving with rationales (missing -> empty string) but strict with
|
|
28
|
-
* scores: missing or non-numeric entries are dropped and the coverage
|
|
29
|
-
* flag on the aggregate flips to false.
|
|
30
|
-
*/
|
|
31
|
-
export declare function parseJudgeResponse(content: string, rubric: RubricDoc): JudgeSample;
|
|
32
|
-
/** Run the judge against an artifact and return per-sample + aggregate data. */
|
|
33
|
-
export declare function runJudge(options: RunJudgeOptions): Promise<JudgeInvocation>;
|
|
34
|
-
/**
|
|
35
|
-
* Convert a JudgeInvocation into VerifierResult[] for the runner. One
|
|
36
|
-
* result per rubric check (score 0..1 normalized from the 1..5 median) +
|
|
37
|
-
* one "coverage" result that flips to `ok:false` when any sample failed
|
|
38
|
-
* to emit a score for a check.
|
|
39
|
-
*/
|
|
40
|
-
export declare function judgeResultsToVerifiers(rubric: RubricDoc, invocation: JudgeInvocation, config: Pick<ResolvedEvalConfig, "regression">, caseHint?: JudgeExpected): VerifierResult[];
|