cclaw-cli 0.23.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +4 -4
- package/dist/constants.d.ts +4 -4
- package/dist/constants.js +4 -4
- package/dist/content/eval-scaffold.d.ts +4 -4
- package/dist/content/eval-scaffold.js +13 -14
- package/dist/content/examples.js +11 -11
- package/dist/content/hooks.js +1 -1
- package/dist/content/skills.d.ts +3 -3
- package/dist/content/skills.js +19 -19
- package/dist/content/stage-schema.js +2 -2
- package/dist/content/stages/plan.js +18 -18
- package/dist/content/stages/schema-types.d.ts +2 -2
- package/dist/content/stages/tdd.js +1 -1
- package/dist/content/subagents.js +1 -1
- package/dist/content/templates.js +8 -8
- package/dist/content/utility-skills.js +19 -19
- package/dist/doctor.js +2 -2
- package/dist/eval/baseline.js +1 -1
- package/dist/eval/corpus.d.ts +12 -1
- package/dist/eval/corpus.js +163 -8
- package/dist/eval/llm-client.d.ts +10 -10
- package/dist/eval/llm-client.js +5 -5
- package/dist/eval/report.js +1 -1
- package/dist/eval/runner.d.ts +6 -6
- package/dist/eval/runner.js +83 -37
- package/dist/eval/types.d.ts +78 -13
- package/dist/eval/verifiers/rules.d.ts +24 -0
- package/dist/eval/verifiers/rules.js +218 -0
- package/dist/eval/verifiers/structural.js +3 -3
- package/dist/eval/verifiers/traceability.d.ts +23 -0
- package/dist/eval/verifiers/traceability.js +84 -0
- package/dist/install.js +3 -3
- package/dist/policy.js +1 -1
- package/package.json +1 -1
package/dist/eval/types.d.ts
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
* deliberately decoupled from the main cclaw runtime so that:
|
|
7
7
|
*
|
|
8
8
|
* - Users who never run `cclaw eval` pay zero runtime cost.
|
|
9
|
-
* - The verifier / rubric / LLM stack evolves on its own release cadence (
|
|
9
|
+
* - The verifier / rubric / LLM stack evolves on its own release cadence (Steps 0-6).
|
|
10
10
|
* - Any OpenAI-compatible endpoint can be swapped in via config (z.ai, OpenAI, vLLM, etc.).
|
|
11
11
|
*/
|
|
12
12
|
import type { FlowStage } from "../types.js";
|
|
@@ -29,8 +29,8 @@ export declare const VERIFIER_KINDS: readonly ["structural", "rules", "judge", "
|
|
|
29
29
|
export type VerifierKind = (typeof VERIFIER_KINDS)[number];
|
|
30
30
|
/**
|
|
31
31
|
* Structural expectations — deterministic, LLM-free checks against a single
|
|
32
|
-
* text artifact.
|
|
33
|
-
* sibling `rules` shape,
|
|
32
|
+
* text artifact. Step 1 implements all fields below; Step 2 adds the
|
|
33
|
+
* sibling `rules` shape, Step 3 adds `judge`.
|
|
34
34
|
*/
|
|
35
35
|
export interface StructuralExpected {
|
|
36
36
|
/**
|
|
@@ -58,19 +58,77 @@ export interface StructuralExpected {
|
|
|
58
58
|
*/
|
|
59
59
|
requiredFrontmatterKeys?: string[];
|
|
60
60
|
}
|
|
61
|
-
/**
|
|
61
|
+
/**
|
|
62
|
+
* Rule-based expectations — zero-LLM content checks that are richer than
|
|
63
|
+
* structural (regex, numeric bounds, uniqueness). Introduced in Step 2.
|
|
64
|
+
*
|
|
65
|
+
* Every array field is optional; an empty `RulesExpected` produces zero
|
|
66
|
+
* verifier results so authors can enable rules incrementally.
|
|
67
|
+
*/
|
|
68
|
+
export interface RulesExpected {
    /**
     * Phrases that must each occur somewhere in the body. Matching is by
     * substring and ignores case.
     */
    mustContain?: Array<string>;
    /**
     * Phrases that must be absent from the body. Matching is by substring
     * and ignores case.
     */
    mustNotContain?: Array<string>;
    /** Patterns that each need one or more matches in the body. */
    regexRequired?: Array<RuleRegex>;
    /** Patterns that must have no match anywhere in the body. */
    regexForbidden?: Array<RuleRegex>;
    /** Lower bounds: phrase -> minimum number of occurrences in the body. */
    minOccurrences?: Record<string, number>;
    /** Upper bounds: phrase -> maximum number of occurrences in the body. */
    maxOccurrences?: Record<string, number>;
    /**
     * Heading fragments (case-insensitive substring of the heading text).
     * Within each named section, the bullets (`- ...`) sitting directly
     * under the heading must not repeat. Useful for spotting a decision or
     * risk that was listed twice.
     */
    uniqueBulletsInSection?: Array<string>;
}
|
|
88
|
+
export interface RuleRegex {
    /**
     * Regular-expression source text. It is compiled with
     * `new RegExp(pattern, flags)`.
     */
    pattern: string;
    /**
     * Flags handed to the `RegExp` constructor. When omitted, `"i"` is used
     * so matching ignores case.
     */
    flags?: string;
    /**
     * Friendly name for the rule. It is shown in verifier messages and a
     * slugged form of it becomes part of the result id.
     */
    description?: string;
}
|
|
96
|
+
/**
|
|
97
|
+
* Cross-stage traceability expectations — assert every ID extracted from
|
|
98
|
+
* `source` also appears in `self` and/or named `extra_fixtures`. Introduced
|
|
99
|
+
* in Step 2.
|
|
100
|
+
*/
|
|
101
|
+
export interface TraceabilityExpected {
    /**
     * Regular expression run over the `source` fixture; its matches form
     * the authoritative set of IDs.
     */
    idPattern: string;
    /** Flags for `idPattern`. Optional; `"g"` is used when omitted. */
    idFlags?: string;
    /**
     * Fixture the authoritative IDs are read from: `"self"` for the case's
     * primary `fixture`, otherwise a label from the case's `extraFixtures`
     * map.
     */
    source: string;
    /**
     * Fixtures in which every source ID has to show up as well. Entries are
     * `"self"` or `extraFixtures` labels; their order is kept so result ids
     * stay deterministic.
     */
    requireIn: Array<string>;
}
|
|
117
|
+
/** Superset of per-verifier expectation shapes. */
|
|
62
118
|
export interface ExpectedShape {
|
|
63
119
|
structural?: StructuralExpected;
|
|
64
|
-
/** Rule-based (keyword/regex/
|
|
65
|
-
rules?:
|
|
66
|
-
/**
|
|
120
|
+
/** Rule-based (keyword/regex/count/uniqueness) checks — Step 2. */
|
|
121
|
+
rules?: RulesExpected;
|
|
122
|
+
/** Cross-stage ID propagation checks — Step 2. */
|
|
123
|
+
traceability?: TraceabilityExpected;
|
|
124
|
+
/** LLM-judge rubrics — Step 3. */
|
|
67
125
|
judge?: Record<string, unknown>;
|
|
68
126
|
}
|
|
69
127
|
/**
|
|
70
128
|
* A single eval case describes one input scenario for one stage. Cases live in
|
|
71
129
|
* `.cclaw/evals/corpus/<stage>/<id>.yaml` and may reference a pre-generated
|
|
72
|
-
* fixture artifact for verifier development (
|
|
73
|
-
* exists (
|
|
130
|
+
* fixture artifact for verifier development (Step 1) before the agent loop
|
|
131
|
+
* exists (Step 3+).
|
|
74
132
|
*/
|
|
75
133
|
export interface EvalCase {
|
|
76
134
|
id: string;
|
|
@@ -85,10 +143,17 @@ export interface EvalCase {
|
|
|
85
143
|
expected?: ExpectedShape;
|
|
86
144
|
/**
|
|
87
145
|
* Path (relative to the corpus case file) of a pre-generated artifact used
|
|
88
|
-
* when verifiers are exercised without a live agent loop. Primarily a
|
|
89
|
-
*
|
|
146
|
+
* when verifiers are exercised without a live agent loop. Primarily a
|
|
147
|
+
* Step 1 development aid.
|
|
90
148
|
*/
|
|
91
149
|
fixture?: string;
|
|
150
|
+
/**
|
|
151
|
+
* Additional fixture paths loaded alongside the primary `fixture`, keyed
|
|
152
|
+
* by a free-form label. Consumed by cross-artifact verifiers (e.g.,
|
|
153
|
+
* traceability) introduced in Step 2. Paths are resolved relative to the
|
|
154
|
+
* case's stage directory, just like `fixture`.
|
|
155
|
+
*/
|
|
156
|
+
extraFixtures?: Record<string, string>;
|
|
92
157
|
}
|
|
93
158
|
/** Result of one verifier applied to one case. */
|
|
94
159
|
export interface VerifierResult {
|
|
@@ -129,7 +194,7 @@ export interface EvalReport {
|
|
|
129
194
|
totalCostUsd: number;
|
|
130
195
|
totalDurationMs: number;
|
|
131
196
|
};
|
|
132
|
-
/** Present when comparing against a saved baseline (
|
|
197
|
+
/** Present when comparing against a saved baseline (Step 1+). */
|
|
133
198
|
baselineDelta?: BaselineDelta;
|
|
134
199
|
}
|
|
135
200
|
/**
|
|
@@ -170,7 +235,7 @@ export interface ResolvedEvalConfig extends EvalConfig {
|
|
|
170
235
|
source: "default" | "file" | "env" | "file+env";
|
|
171
236
|
}
|
|
172
237
|
/**
|
|
173
|
-
* Frozen per-stage baseline used by regression gating (
|
|
238
|
+
* Frozen per-stage baseline used by regression gating (Step 1). Baselines
|
|
174
239
|
* are committed to git; `cclaw eval --update-baseline --confirm` rewrites
|
|
175
240
|
* them. The shape is intentionally flat so a quick `git diff` reveals what
|
|
176
241
|
* changed between runs.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Rule-based verifier: deterministic, zero-LLM checks that are richer than
|
|
3
|
+
* structural heading/length assertions. Each rule produces exactly one
|
|
4
|
+
* `VerifierResult` so baselines diff at the check level, and authoring a
|
|
5
|
+
* rule sideways in YAML never silently skips.
|
|
6
|
+
*
|
|
7
|
+
* Semantics:
|
|
8
|
+
*
|
|
9
|
+
* - All substring matching is case-insensitive. Regex matching uses the
|
|
10
|
+
* flags declared on the rule (default `"i"`).
|
|
11
|
+
* - Rules operate on the artifact BODY (frontmatter stripped), mirroring
|
|
12
|
+
* the structural verifier so min/max counts and length checks agree on
|
|
13
|
+
* what "body" means.
|
|
14
|
+
* - `uniqueBulletsInSection` scans every section (heading, case-insensitive
|
|
15
|
+
* substring match) and flags duplicate top-level bullets ("- item"). The
|
|
16
|
+
* search stops at the next heading, whatever its depth.
|
|
17
|
+
*/
|
|
18
|
+
import type { RulesExpected, VerifierResult } from "../types.js";
|
|
19
|
+
/**
|
|
20
|
+
* Run every configured rule check against the artifact body. Returns `[]`
|
|
21
|
+
* when `expected` is undefined or empty so the runner can distinguish
|
|
22
|
+
* "no rules declared" from "all rules passed".
|
|
23
|
+
*/
|
|
24
|
+
export declare function verifyRules(artifact: string, expected: RulesExpected | undefined): VerifierResult[];
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
import { splitFrontmatter } from "./structural.js";
|
|
2
|
+
/**
 * Turn free-form text into a short slug that is safe inside a result id.
 *
 * The input is lower-cased, every run of characters outside `[a-z0-9]`
 * becomes a single hyphen, edge hyphens are stripped and the result is
 * capped at 64 characters. When nothing usable is left the slug is `"rule"`.
 *
 * @param input Text to slugify.
 * @returns A slug of at most 64 characters that never starts or ends with a hyphen.
 */
function slugify(input) {
    const slug = input
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, "-")
        .replace(/(^-|-$)/g, "")
        .slice(0, 64)
        // Cutting at 64 characters can land right after a separator, so the
        // end has to be trimmed a second time. Runs were collapsed above, so
        // at most one hyphen can be left there.
        .replace(/-$/, "");
    return slug || "rule";
}
|
|
9
|
+
/**
 * Assemble one verifier result tagged with kind `"rules"`.
 *
 * @param id Result id.
 * @param ok Whether the check passed; the score is 1 when truthy, else 0.
 * @param message Human-readable outcome.
 * @param details Optional payload; the `details` key is left out entirely
 *   when this is `undefined`.
 * @returns The result object.
 */
function result(id, ok, message, details) {
    const base = { kind: "rules", id, ok, score: ok ? 1 : 0, message };
    if (details === undefined) {
        return base;
    }
    return { ...base, details };
}
|
|
19
|
+
/**
 * Count non-overlapping occurrences of `needle` in `haystack`, scanning
 * left to right. Matching is exact (callers lower-case both sides first).
 *
 * @param haystack Text to search.
 * @param needle Text to look for; an empty needle counts as zero hits.
 * @returns Number of occurrences.
 */
function countOccurrences(haystack, needle) {
    const step = needle.length;
    if (step === 0) {
        return 0;
    }
    let total = 0;
    for (let pos = haystack.indexOf(needle, 0); pos >= 0; pos = haystack.indexOf(needle, pos + step)) {
        total += 1;
    }
    return total;
}
|
|
32
|
+
function compileRegex(rule) {
|
|
33
|
+
const flags = rule.flags ?? "i";
|
|
34
|
+
try {
|
|
35
|
+
return new RegExp(rule.pattern, flags);
|
|
36
|
+
}
|
|
37
|
+
catch (err) {
|
|
38
|
+
throw new Error(`Invalid regex for rule "${rule.description ?? rule.pattern}" ` +
|
|
39
|
+
`(pattern=${JSON.stringify(rule.pattern)}, flags=${JSON.stringify(flags)}): ` +
|
|
40
|
+
(err instanceof Error ? err.message : String(err)));
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
/**
 * Pick the display label for a regex rule: the trimmed `description` when
 * it has any non-whitespace content, otherwise the raw `pattern`.
 *
 * @param rule Rule with `pattern` and optional `description`.
 * @returns The label.
 */
function ruleLabel(rule) {
    const trimmed = rule.description?.trim();
    return trimmed ? trimmed : rule.pattern;
}
|
|
46
|
+
/**
 * Emit one result per required phrase, passing when the phrase occurs in
 * the body. Both sides are lower-cased before comparing.
 *
 * @param needles Required phrases.
 * @param body Artifact body.
 * @returns One result per phrase, in input order.
 */
function checkMustContain(needles, body) {
    const haystack = body.toLowerCase();
    const outcomes = [];
    for (const phrase of needles) {
        const present = haystack.includes(phrase.toLowerCase());
        const message = present
            ? `Required phrase "${phrase}" present.`
            : `Required phrase "${phrase}" missing from body.`;
        outcomes.push(result(`rules:contains:${slugify(phrase)}`, present, message, { phrase }));
    }
    return outcomes;
}
|
|
55
|
+
/**
 * Emit one result per forbidden phrase, passing when the phrase never
 * occurs in the body. Both sides are lower-cased before counting.
 *
 * @param needles Forbidden phrases.
 * @param body Artifact body.
 * @returns One result per phrase, in input order; `details` carries the
 *   occurrence count.
 */
function checkMustNotContain(needles, body) {
    const haystack = body.toLowerCase();
    const outcomes = [];
    for (const phrase of needles) {
        const occurrences = countOccurrences(haystack, phrase.toLowerCase());
        const absent = occurrences === 0;
        const message = absent
            ? `Forbidden phrase "${phrase}" absent (as required).`
            : `Forbidden phrase "${phrase}" appears ${occurrences} time(s).`;
        outcomes.push(result(`rules:not-contains:${slugify(phrase)}`, absent, message, { phrase, occurrences }));
    }
    return outcomes;
}
|
|
66
|
+
/**
 * Emit one result per required pattern, passing when the pattern matches
 * the body at least once.
 *
 * @param rules Regex rules.
 * @param body Artifact body.
 * @returns One result per rule, in input order; `details` carries the
 *   pattern, the effective flags and the match count.
 * @throws Error when a rule's pattern or flags do not compile.
 */
function checkRegexRequired(rules, body) {
    return rules.map((rule) => {
        const label = ruleLabel(rule);
        const compiled = compileRegex(rule);
        // Rebuild with the global flag so `match` reports every hit, not just the first.
        const everyHit = new RegExp(compiled.source, withGlobal(compiled.flags));
        const hits = body.match(everyHit);
        const count = hits === null ? 0 : hits.length;
        const matched = count > 0;
        const message = matched
            ? `Required pattern /${rule.pattern}/ matched ${count} time(s).`
            : `Required pattern /${rule.pattern}/ did not match.`;
        const details = { pattern: rule.pattern, flags: rule.flags ?? "i", matches: count };
        return result(`rules:regex-required:${slugify(label)}`, matched, message, details);
    });
}
|
|
78
|
+
/**
 * Emit one result per forbidden pattern, passing when the pattern has no
 * match in the body.
 *
 * @param rules Regex rules.
 * @param body Artifact body.
 * @returns One result per rule, in input order; `details` carries the
 *   pattern, the effective flags and the match count.
 * @throws Error when a rule's pattern or flags do not compile.
 */
function checkRegexForbidden(rules, body) {
    return rules.map((rule) => {
        const label = ruleLabel(rule);
        const compiled = compileRegex(rule);
        // Rebuild with the global flag so `match` reports every hit, not just the first.
        const everyHit = new RegExp(compiled.source, withGlobal(compiled.flags));
        const hits = body.match(everyHit);
        const count = hits === null ? 0 : hits.length;
        const clean = count === 0;
        const message = clean
            ? `Forbidden pattern /${rule.pattern}/ absent.`
            : `Forbidden pattern /${rule.pattern}/ matched ${count} time(s).`;
        const details = { pattern: rule.pattern, flags: rule.flags ?? "i", matches: count };
        return result(`rules:regex-forbidden:${slugify(label)}`, clean, message, details);
    });
}
|
|
90
|
+
/**
 * Make sure a regex flag string includes the global flag.
 *
 * @param flags Flag string.
 * @returns `flags` unchanged when it already holds `g`, otherwise `flags`
 *   with `g` appended.
 */
function withGlobal(flags) {
    if (flags.includes("g")) {
        return flags;
    }
    return flags + "g";
}
|
|
93
|
+
/**
 * Emit one result per phrase with a lower bound, passing when the phrase
 * occurs at least that many times. Counting ignores case.
 *
 * @param bounds Map of phrase -> minimum occurrence count.
 * @param body Artifact body.
 * @returns One result per map entry, in the map's key order.
 */
function checkMinOccurrences(bounds, body) {
    const haystack = body.toLowerCase();
    const outcomes = [];
    for (const [phrase, min] of Object.entries(bounds)) {
        const occurrences = countOccurrences(haystack, phrase.toLowerCase());
        const enough = occurrences >= min;
        const message = enough
            ? `Phrase "${phrase}" appears ${occurrences} time(s) (>= ${min}).`
            : `Phrase "${phrase}" appears ${occurrences} time(s); expected at least ${min}.`;
        outcomes.push(result(`rules:min-occurrences:${slugify(phrase)}`, enough, message, { phrase, occurrences, min }));
    }
    return outcomes;
}
|
|
103
|
+
/**
 * Emit one result per phrase with an upper bound, passing when the phrase
 * occurs at most that many times. Counting ignores case.
 *
 * @param bounds Map of phrase -> maximum occurrence count.
 * @param body Artifact body.
 * @returns One result per map entry, in the map's key order.
 */
function checkMaxOccurrences(bounds, body) {
    const haystack = body.toLowerCase();
    const outcomes = [];
    for (const [phrase, max] of Object.entries(bounds)) {
        const occurrences = countOccurrences(haystack, phrase.toLowerCase());
        const withinLimit = occurrences <= max;
        const message = withinLimit
            ? `Phrase "${phrase}" appears ${occurrences} time(s) (<= ${max}).`
            : `Phrase "${phrase}" appears ${occurrences} time(s); expected at most ${max}.`;
        outcomes.push(result(`rules:max-occurrences:${slugify(phrase)}`, withinLimit, message, { phrase, occurrences, max }));
    }
    return outcomes;
}
|
|
113
|
+
/**
 * Split a markdown body into sections, one per ATX heading (`#` .. `######`).
 *
 * A section runs from its heading to the next heading of any depth; lines
 * before the first heading belong to no section. Lines inside a fenced code
 * block (``` or ~~~) are always section content, so a `# comment` in a shell
 * snippet does not end the section that contains the snippet.
 *
 * @param body Markdown text (frontmatter already removed by the caller).
 * @returns Sections in document order as `{ heading, depth, body }`, where
 *   `heading` is the trimmed heading text, `depth` the number of `#`
 *   characters and `body` the section's lines joined with `\n`.
 */
function sliceBySection(body) {
    const slices = [];
    let open = false;
    let heading = "";
    let depth = 0;
    let lines = [];
    // Fence state: the marker character and run length of the fence that
    // opened the current code block, or "" / 0 when outside any fence.
    let fenceMarker = "";
    let fenceLength = 0;
    const closeSection = () => {
        if (open) {
            slices.push({ heading, depth, body: lines.join("\n") });
        }
    };
    for (const rawLine of body.split(/\r?\n/)) {
        if (fenceMarker !== "") {
            if (open) {
                lines.push(rawLine);
            }
            // A fence closes only on a line holding nothing but a run of the
            // same marker that is at least as long as the opening run.
            const closer = rawLine.match(/^ {0,3}(`{3,}|~{3,})\s*$/);
            if (closer && closer[1][0] === fenceMarker && closer[1].length >= fenceLength) {
                fenceMarker = "";
                fenceLength = 0;
            }
            continue;
        }
        const opener = rawLine.match(/^ {0,3}(`{3,}|~{3,})/);
        if (opener) {
            fenceMarker = opener[1][0];
            fenceLength = opener[1].length;
            if (open) {
                lines.push(rawLine);
            }
            continue;
        }
        const match = rawLine.trimStart().match(/^(#{1,6})\s+(.+?)\s*$/);
        if (match) {
            closeSection();
            open = true;
            heading = match[2].trim();
            depth = match[1].length;
            lines = [];
        }
        else if (open) {
            lines.push(rawLine);
        }
    }
    closeSection();
    return slices;
}
|
|
143
|
+
/**
 * Collect the text of every unindented bullet in a section body.
 *
 * A bullet is a line that starts at column 0 with `-` or `*`, followed by
 * whitespace and some text. Indented (nested) bullets are ignored.
 *
 * @param sectionBody Section text.
 * @returns Trimmed bullet texts in document order.
 */
function extractTopLevelBullets(sectionBody) {
    return sectionBody
        .split(/\r?\n/)
        .map((rawLine) => rawLine.trimEnd().match(/^[-*]\s+(.+)$/))
        .filter((hit) => hit !== null)
        .map((hit) => hit[1].trim());
}
|
|
156
|
+
/**
 * Emit one result per requested section, passing when no top-level bullet
 * in that section repeats. Bullets are compared case-insensitively.
 *
 * Only the first section whose heading contains the requested text
 * (case-insensitive) is inspected. A section that cannot be found yields a
 * failing result.
 *
 * @param sections Heading fragments naming the sections to check.
 * @param body Artifact body.
 * @returns One result per requested section, in input order.
 */
function checkUniqueBulletsInSection(sections, body) {
    const slices = sliceBySection(body);
    const checkSection = (needle) => {
        const id = `rules:unique-in-section:${slugify(needle)}`;
        const wanted = needle.toLowerCase();
        const slice = slices.find((candidate) => candidate.heading.toLowerCase().includes(wanted));
        if (!slice) {
            return result(id, false, `Section matching "${needle}" not found; cannot check uniqueness.`, { section: needle, found: false });
        }
        const bullets = extractTopLevelBullets(slice.body);
        // Tally by lower-cased text; a Map keeps first-seen order for reporting.
        const tally = new Map();
        bullets.forEach((bullet) => {
            const key = bullet.toLowerCase();
            tally.set(key, (tally.get(key) ?? 0) + 1);
        });
        const duplicates = [];
        for (const [entry, count] of tally) {
            if (count > 1) {
                duplicates.push({ entry, count });
            }
        }
        const ok = duplicates.length === 0;
        const message = ok
            ? `Section "${slice.heading}" has ${bullets.length} unique bullet(s).`
            : `Section "${slice.heading}" has duplicate bullet(s): ${duplicates
                .map((d) => `"${d.entry}" x${d.count}`)
                .join(", ")}.`;
        return result(id, ok, message, {
            section: slice.heading,
            bullets: bullets.length,
            duplicates
        });
    };
    return sections.map((needle) => checkSection(needle));
}
|
|
185
|
+
/**
 * Apply every rule family declared in `expected` to the artifact's body
 * (the text left after frontmatter is split off).
 *
 * Families run in a fixed order: mustContain, mustNotContain,
 * regexRequired, regexForbidden, minOccurrences, maxOccurrences,
 * uniqueBulletsInSection. A family that is absent or empty contributes
 * nothing, so an undefined or empty `expected` yields `[]` and the runner
 * can tell "no rules declared" apart from "all rules passed".
 *
 * @param artifact Full artifact text, frontmatter included.
 * @param expected Declared rule expectations, if any.
 * @returns The concatenated results of every declared family.
 */
export function verifyRules(artifact, expected) {
    if (!expected) {
        return [];
    }
    const { body } = splitFrontmatter(artifact);
    return [
        ...(expected.mustContain?.length
            ? checkMustContain(expected.mustContain, body)
            : []),
        ...(expected.mustNotContain?.length
            ? checkMustNotContain(expected.mustNotContain, body)
            : []),
        ...(expected.regexRequired?.length
            ? checkRegexRequired(expected.regexRequired, body)
            : []),
        ...(expected.regexForbidden?.length
            ? checkRegexForbidden(expected.regexForbidden, body)
            : []),
        ...(expected.minOccurrences && Object.keys(expected.minOccurrences).length
            ? checkMinOccurrences(expected.minOccurrences, body)
            : []),
        ...(expected.maxOccurrences && Object.keys(expected.maxOccurrences).length
            ? checkMaxOccurrences(expected.maxOccurrences, body)
            : []),
        ...(expected.uniqueBulletsInSection?.length
            ? checkUniqueBulletsInSection(expected.uniqueBulletsInSection, body)
            : [])
    ];
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Structural verifier
|
|
2
|
+
* Structural verifier: deterministic, zero-LLM checks against a
|
|
3
3
|
* single markdown artifact. Each structural expectation produces one
|
|
4
4
|
* `VerifierResult` so baselines diff cleanly at the check level rather than
|
|
5
5
|
* lumping everything into a single boolean.
|
|
@@ -15,8 +15,8 @@
|
|
|
15
15
|
* - `minLines`/`maxLines` intentionally exclude frontmatter so a rewrite that
|
|
16
16
|
* adds metadata does not accidentally drop the body below the floor.
|
|
17
17
|
* - Scoring: each check scores 0 or 1. The case `passed` becomes the AND of
|
|
18
|
-
* all individual `ok` flags. This keeps
|
|
19
|
-
* rubric scale shows up in
|
|
18
|
+
* all individual `ok` flags. This keeps the structural verifier
|
|
19
|
+
* deterministic; the 0..1 rubric scale shows up later in the LLM judge.
|
|
20
20
|
*/
|
|
21
21
|
import { parse as parseYaml } from "yaml";
|
|
22
22
|
const FRONTMATTER_OPEN = /^---\r?\n/;
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-stage traceability verifier: extract a set of IDs from a source
|
|
3
|
+
* fixture (e.g. `D-\d+` decisions declared during scope) and assert every
|
|
4
|
+
* ID appears in the artifact-under-test and/or in other linked fixtures.
|
|
5
|
+
*
|
|
6
|
+
* The verifier is intentionally source-agnostic: the caller passes the
|
|
7
|
+
* primary artifact plus a label → text map for any extra fixtures declared
|
|
8
|
+
* on the case. `source` and entries in `requireIn` are either the string
|
|
9
|
+
* `"self"` (the primary artifact) or labels present in the extras map.
|
|
10
|
+
*
|
|
11
|
+
* Result ids follow `traceability:<source>-><target>` so baselines
|
|
12
|
+
* diff at the per-link granularity. A missing link produces one result with
|
|
13
|
+
* a list of missing IDs in its `details` payload.
|
|
14
|
+
*/
|
|
15
|
+
import type { TraceabilityExpected, VerifierResult } from "../types.js";
|
|
16
|
+
export declare const SELF_LABEL = "self";
|
|
17
|
+
/**
|
|
18
|
+
* Run traceability checks. Returns `[]` when expectations are undefined.
|
|
19
|
+
* Emits a single `traceability:source:<label>:empty` result when the declared source fixture
|
|
20
|
+
* has zero IDs (authoring error), and one result per `requireIn` target
|
|
21
|
+
* listing any IDs absent in that fixture.
|
|
22
|
+
*/
|
|
23
|
+
export declare function verifyTraceability(primaryArtifact: string, extraFixtures: Record<string, string>, expected: TraceabilityExpected | undefined): VerifierResult[];
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import { splitFrontmatter } from "./structural.js";
|
|
2
|
+
export const SELF_LABEL = "self";
|
|
3
|
+
/**
 * Assemble one verifier result for a traceability check.
 *
 * NOTE(review): results are tagged `kind: "rules"`, not a dedicated
 * traceability kind - confirm this is intentional.
 *
 * @param id Result id.
 * @param ok Whether the check passed; the score is 1 when truthy, else 0.
 * @param message Human-readable outcome.
 * @param details Optional payload; the `details` key is left out entirely
 *   when this is `undefined`.
 * @returns The result object.
 */
function result(id, ok, message, details) {
    const base = { kind: "rules", id, ok, score: ok ? 1 : 0, message };
    if (details === undefined) {
        return base;
    }
    return { ...base, details };
}
|
|
13
|
+
function compileIdRegex(expected) {
|
|
14
|
+
const flags = expected.idFlags ?? "g";
|
|
15
|
+
const normalized = flags.includes("g") ? flags : `${flags}g`;
|
|
16
|
+
try {
|
|
17
|
+
return new RegExp(expected.idPattern, normalized);
|
|
18
|
+
}
|
|
19
|
+
catch (err) {
|
|
20
|
+
throw new Error(`Invalid traceability id_pattern ${JSON.stringify(expected.idPattern)} ` +
|
|
21
|
+
`(flags=${JSON.stringify(normalized)}): ` +
|
|
22
|
+
(err instanceof Error ? err.message : String(err)));
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
/**
 * Return the body part of a fixture, i.e. the `body` field produced by
 * `splitFrontmatter`.
 *
 * @param text Full fixture text.
 * @returns The fixture body.
 */
function bodyOf(text) {
    const { body } = splitFrontmatter(text);
    return body;
}
|
|
28
|
+
/**
 * Collect the distinct ids a pattern finds in a fixture's body.
 *
 * @param text Full fixture text; only its body is scanned.
 * @param regex Global regular expression whose full match is the id.
 * @returns The distinct matches, sorted with the default string ordering.
 */
function extractIds(text, regex) {
    const matches = Array.from(bodyOf(text).matchAll(regex), (hit) => hit[0]);
    const distinct = new Set(matches);
    return Array.from(distinct).sort();
}
|
|
36
|
+
/**
 * Look up the text a fixture label refers to.
 *
 * @param label `SELF_LABEL` for the primary artifact, otherwise a key of
 *   `extraFixtures`.
 * @param primary Text of the primary artifact.
 * @param extraFixtures Map of label -> fixture text.
 * @returns The fixture text, or `undefined` when the label is not a key of
 *   `extraFixtures`.
 */
function resolveFixture(label, primary, extraFixtures) {
    if (label === SELF_LABEL)
        return primary;
    // Only own keys count. A plain index lookup would resolve labels such as
    // "constructor" or "toString" to inherited members of Object.prototype,
    // so callers would receive a non-string instead of `undefined`.
    return Object.prototype.hasOwnProperty.call(extraFixtures, label)
        ? extraFixtures[label]
        : undefined;
}
|
|
41
|
+
/**
 * Report whether `id` occurs in `text` as a standalone token.
 *
 * An occurrence does not count when it is glued to a neighbouring word
 * character (`[A-Za-z0-9_]`) on a side where the id itself starts or ends
 * with a word character. That keeps `D-1` from being satisfied by `D-10`
 * or `XD-1`. An empty id is treated as present, as plain substring search
 * would.
 *
 * @param text Text to search.
 * @param id Id to look for.
 * @returns `true` when a standalone occurrence exists.
 */
function containsStandaloneId(text, id) {
    if (id.length === 0)
        return true;
    const wordChar = /\w/;
    const guardStart = wordChar.test(id[0]);
    const guardEnd = wordChar.test(id[id.length - 1]);
    let from = 0;
    while (true) {
        const at = text.indexOf(id, from);
        if (at < 0)
            return false;
        const end = at + id.length;
        const gluedBefore = guardStart && at > 0 && wordChar.test(text[at - 1]);
        const gluedAfter = guardEnd && end < text.length && wordChar.test(text[end]);
        if (!gluedBefore && !gluedAfter)
            return true;
        from = at + 1;
    }
}
/**
 * Run traceability checks. Returns `[]` when expectations are undefined.
 *
 * Emits a single failing result and stops when the source fixture is not
 * loaded (`traceability:source:<label>:missing`) or yields zero ids
 * (`traceability:source:<label>:empty`, an authoring error). Otherwise
 * emits one result per `requireIn` target, in declaration order:
 * `traceability:target:<label>:missing` when the target fixture is not
 * loaded, else `traceability:<source>-><target>` listing any source ids
 * absent from the target body.
 *
 * @param primaryArtifact Text of the artifact under test (label `"self"`).
 * @param extraFixtures Map of label -> text for the case's extra fixtures.
 * @param expected Traceability expectations, if any.
 * @returns The verifier results.
 * @throws Error when the id pattern or its flags do not compile.
 */
export function verifyTraceability(primaryArtifact, extraFixtures, expected) {
    if (!expected)
        return [];
    const regex = compileIdRegex(expected);
    const sourceText = resolveFixture(expected.source, primaryArtifact, extraFixtures);
    if (sourceText === undefined) {
        return [
            result(`traceability:source:${expected.source}:missing`, false, `Traceability source fixture "${expected.source}" not loaded.`, { source: expected.source })
        ];
    }
    const sourceIds = extractIds(sourceText, regex);
    if (sourceIds.length === 0) {
        return [
            result(`traceability:source:${expected.source}:empty`, false, `Source "${expected.source}" yielded zero ids for pattern /${expected.idPattern}/.`, { source: expected.source, pattern: expected.idPattern })
        ];
    }
    const results = [];
    for (const target of expected.requireIn) {
        const targetText = resolveFixture(target, primaryArtifact, extraFixtures);
        if (targetText === undefined) {
            results.push(result(`traceability:target:${target}:missing`, false, `Traceability target fixture "${target}" not loaded.`, { target }));
            continue;
        }
        const targetBody = bodyOf(targetText);
        // Token-aware lookup: a bare substring test would accept "D-10" as
        // proof that "D-1" was carried over.
        const missing = sourceIds.filter((id) => !containsStandaloneId(targetBody, id));
        const ok = missing.length === 0;
        results.push(result(`traceability:${expected.source}->${target}`, ok, ok
            ? `Every id (${sourceIds.length}) from "${expected.source}" appears in "${target}".`
            : `Target "${target}" is missing ${missing.length}/${sourceIds.length} id(s): ${missing.join(", ")}.`, {
            source: expected.source,
            target,
            sourceIds,
            missing,
            pattern: expected.idPattern
        }));
    }
    return results;
}
|
package/dist/install.js
CHANGED
|
@@ -29,7 +29,7 @@ import { META_SKILL_NAME, usingCclawSkillMarkdown } from "./content/meta-skill.j
|
|
|
29
29
|
import { decisionProtocolMarkdown, completionProtocolMarkdown, ethosProtocolMarkdown } from "./content/protocols.js";
|
|
30
30
|
import { ARTIFACT_TEMPLATES, CURSOR_WORKFLOW_RULE_MDC, RULEBOOK_MARKDOWN, buildRulesJson } from "./content/templates.js";
|
|
31
31
|
import { EVAL_BASELINES_README, EVAL_CONFIG_YAML, EVAL_CORPUS_README, EVAL_REPORTS_README, EVAL_RUBRICS_README } from "./content/eval-scaffold.js";
|
|
32
|
-
import {
|
|
32
|
+
import { TDD_BATCH_WALKTHROUGH_MARKDOWN, stageSkillFolder, stageSkillMarkdown } from "./content/skills.js";
|
|
33
33
|
import { stageCommonGuidanceMarkdown } from "./content/stage-common-guidance.js";
|
|
34
34
|
import { STAGE_EXAMPLES_REFERENCE_DIR, stageExamplesReferenceMarkdown } from "./content/examples.js";
|
|
35
35
|
import { LANGUAGE_RULE_PACK_DIR, LANGUAGE_RULE_PACK_FILES, LANGUAGE_RULE_PACK_GENERATORS, LEGACY_LANGUAGE_RULE_PACK_FOLDERS, UTILITY_SKILL_FOLDERS, UTILITY_SKILL_MAP } from "./content/utility-skills.js";
|
|
@@ -218,11 +218,11 @@ async function writeSkills(projectRoot, config) {
|
|
|
218
218
|
await writeFileSafe(runtimePath(projectRoot, ...referenceDir, `${stage}-examples.md`), referenceMarkdown);
|
|
219
219
|
}
|
|
220
220
|
}
|
|
221
|
-
// Progressive disclosure for the TDD
|
|
221
|
+
// Progressive disclosure for the TDD Batch Execution walkthrough (A.1#1).
|
|
222
222
|
// The detailed 3-task transcript lives next to stage examples so the
|
|
223
223
|
// always-rendered TDD skill stays under the line-budget and the reference
|
|
224
224
|
// is loaded on demand.
|
|
225
|
-
await writeFileSafe(runtimePath(projectRoot, ...STAGE_EXAMPLES_REFERENCE_DIR.split("/"), "tdd-
|
|
225
|
+
await writeFileSafe(runtimePath(projectRoot, ...STAGE_EXAMPLES_REFERENCE_DIR.split("/"), "tdd-batch-walkthrough.md"), TDD_BATCH_WALKTHROUGH_MARKDOWN);
|
|
226
226
|
await writeFileSafe(runtimePath(projectRoot, ...STAGE_EXAMPLES_REFERENCE_DIR.split("/"), "common-guidance.md"), stageCommonGuidanceMarkdown());
|
|
227
227
|
// Utility skills (not flow stages)
|
|
228
228
|
await writeFileSafe(runtimePath(projectRoot, "skills", "learnings", "SKILL.md"), learnSkillMarkdown());
|
package/dist/policy.js
CHANGED
|
@@ -161,7 +161,7 @@ export async function policyChecks(projectRoot, options = {}) {
|
|
|
161
161
|
{ file: runtimeFile("skills/docs/SKILL.md"), needle: "## README Guidance", name: "utility_skill:docs:readme" },
|
|
162
162
|
{ file: runtimeFile("skills/executing-plans/SKILL.md"), needle: "## HARD-GATE", name: "utility_skill:executing_plans:hard_gate" },
|
|
163
163
|
{ file: runtimeFile("skills/executing-plans/SKILL.md"), needle: "## Execution Protocol", name: "utility_skill:executing_plans:protocol" },
|
|
164
|
-
{ file: runtimeFile("skills/executing-plans/SKILL.md"), needle: "##
|
|
164
|
+
{ file: runtimeFile("skills/executing-plans/SKILL.md"), needle: "## Batch Checklist", name: "utility_skill:executing_plans:batches" },
|
|
165
165
|
{ file: runtimeFile("skills/verification-before-completion/SKILL.md"), needle: "## HARD-GATE", name: "utility_skill:verification_before_completion:hard_gate" },
|
|
166
166
|
{ file: runtimeFile("skills/verification-before-completion/SKILL.md"), needle: "## Protocol", name: "utility_skill:verification_before_completion:protocol" },
|
|
167
167
|
{ file: runtimeFile("skills/finishing-a-development-branch/SKILL.md"), needle: "## HARD-GATE", name: "utility_skill:finishing_branch:hard_gate" },
|