kc-beta 0.7.0 → 0.7.2

package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "kc-beta",
-   "version": "0.7.0",
+   "version": "0.7.2",
    "description": "KC Agent — LLM document verification agent (pure Node.js CLI). Dual-licensed: PolyForm Noncommercial 1.0.0 for personal/noncommercial use; commercial license required for enterprise production. See LICENSE and LICENSE-COMMERCIAL.md.",
    "type": "module",
    "bin": {
@@ -21,6 +21,7 @@ import { SnapshotTool } from "./tools/snapshot.js";
  import { ArchiveFileTool } from "./tools/archive-file.js";
  import { ScheduleFetchTool } from "./tools/schedule-fetch.js";
  import { ReleaseTool } from "./tools/release.js";
+ import { readKcVersion } from "../util/kc-version.js";
  import { PhaseAdvanceTool } from "./tools/phase-advance.js";
  import { DocumentParseTool } from "./tools/document-parse.js";
  import { DocumentSearchTool } from "./tools/document-search.js";
@@ -421,9 +422,23 @@ export class AgentEngine {
        new SnapshotTool(this.workspace),
        new ArchiveFileTool(this.workspace),
        new ScheduleFetchTool(this.workspace),
-       new ReleaseTool(this.workspace, { kcVersion: "0.5.2" }),
+       new ReleaseTool(this.workspace, { kcVersion: readKcVersion() }),
        new PhaseAdvanceTool(
-         (to, reason, opts) => this._advancePhase(to, reason, opts),
+         // v0.7.1 2c: advanceFn returns a rich `{advanced, engineCounts?}`
+         // object so the tool's refusal text can surface the engine
+         // telemetry that motivated the refusal. Internal callers of
+         // `_advancePhase` continue to use the bool return value directly;
+         // only this lambda wraps for the LLM-facing tool.
+         (to, reason, opts) => {
+           const advanced = this._advancePhase(to, reason, opts);
+           if (!advanced) {
+             let engineCounts = null;
+             try { engineCounts = this._buildEngineCountsBlock(this.currentPhase); }
+             catch { /* defensive */ }
+             return { advanced: false, engineCounts };
+           }
+           return { advanced: true };
+         },
          () => this.currentPhase, // H1: tool reads phase BEFORE its own call
          // v0.6.2 J1: surface running subagents so the tool can refuse
          // advance until the agent explicitly acknowledges them.
@@ -1665,7 +1680,24 @@ export class AgentEngine {
          parts.push(`monitoring: ${pipeline.monitoringPhase ?? "?"}`);
          break;
        }
-       // bootstrap / finalization: no specific counters, fall through
+       case "bootstrap": {
+         // v0.7.2 1e: previously fell through to an empty string. Both
+         // v0.7.1 audit runs had bootstrap → rule_extraction refusals
+         // with engineCounts: "" — the agent saw the refusal but had no
+         // engine telemetry to react to. The InitializerPipeline tracks
+         // boolean checklist flags rather than numeric counters; we
+         // surface those flags as "yes/no" so the agent can see which
+         // bootstrap criterion is missing.
+         if (typeof pipeline.describeBootstrapChecklist === "function") {
+           const cl = pipeline.describeBootstrapChecklist();
+           parts.push(`workspaceCreated: ${cl.workspaceCreated ? "yes" : "no"}`);
+           parts.push(`configReady: ${cl.configReady ? "yes" : "no"}`);
+           parts.push(`hasRegulations: ${cl.hasRegulations ? "yes" : "no"}`);
+           parts.push(`hasSamples: ${cl.hasSamples ? "yes" : "no"}`);
+         }
+         break;
+       }
+       // finalization: no specific counters, fall through
      }
    } catch { /* never let summary build break phase advance */ }
    return parts.join(", ");
@@ -57,16 +57,21 @@ function listChildFiles(p) {
  // Walk a directory recursively, yielding every file path. Skips hidden
  // dirs/files and __pycache__. Used by derive functions that need to
  // match arbitrarily-nested artifacts (e.g., scripts/ subdirs).
- function* walkFiles(root) {
+ //
+ // v0.7.2 1a: optional maxDepth caps recursion. depth=0 is root's
+ // direct children; depth=1 is one level down. Default unbounded
+ // (existing callers).
+ function* walkFiles(root, { maxDepth } = {}) {
    if (!dirExists(root)) return;
-   const stack = [root];
+   const stack = [{ dir: root, depth: 0 }];
    while (stack.length) {
-     const dir = stack.pop();
+     const { dir, depth } = stack.pop();
      for (const e of readDirSafe(dir)) {
        if (e.name.startsWith(".") || e.name === "__pycache__") continue;
        const p = path.join(dir, e.name);
-       if (e.isDirectory()) stack.push(p);
-       else if (e.isFile()) yield p;
+       if (e.isDirectory()) {
+         if (maxDepth == null || depth < maxDepth) stack.push({ dir: p, depth: depth + 1 });
+       } else if (e.isFile()) yield p;
      }
    }
  }
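A quick reading of the new `maxDepth` semantics, since depth 0 already covers the root's direct children — a minimal sketch against a hypothetical layout (paths invented for illustration):

```js
// Hypothetical workspace, assuming walkFiles as patched above:
//   ws/a.json           — yielded (root's direct child, iterated at depth 0)
//   ws/out/b.json       — yielded (out/ pushed at depth 1)
//   ws/out/deep/c.json  — skipped  (entering deep/ would need depth 1 < maxDepth)
for (const p of walkFiles("ws", { maxDepth: 1 })) {
  console.log(p); // ws/a.json, ws/out/b.json
}
for (const p of walkFiles("ws")) {
  console.log(p); // all three — the default stays unbounded for existing callers
}
```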
@@ -250,7 +255,8 @@ export function deriveSkillAuthoringMilestones(workspace) {
  export function deriveSkillTestingMilestones(workspace) {
    const cwd = cwdOf(workspace);
    const skillsDir = path.join(cwd, "rule_skills");
-   const skillsTested = [];
+   // Use a Set so the v0.7.1 1a output/-side scan can add without duplicates.
+   const tested = new Set();

    if (dirExists(skillsDir)) {
      for (const e of listChildDirs(skillsDir)) {
@@ -266,14 +272,86 @@ export function deriveSkillTestingMilestones(workspace) {
          fileExists(path.join(skillPath, "assets", "test_cases.json")) ||
          listChildFiles(skillPath).some((f) =>
            /^(test|.*_test)_(output|result|log)/i.test(f.name) && f.name.endsWith(".json"));
-       if (hasTestArtifact) skillsTested.push(e.name);
+       if (hasTestArtifact) tested.add(e.name);
+     }
+   }
+
+   // v0.7.1 1a / v0.7.2 1a: credit rules whose verdicts appear anywhere
+   // under output/*.json. Agents persist batch-test results in
+   // conductor-specific shapes (this is the recurring drift point —
+   // engine derivation has to match disk reality, not the other way
+   // around). Shapes seen across E2E #5/6/7:
+   //
+   // - DS v0.7.0/0.7.1: catalog.json as an array of {id: "R001", ...}
+   //   entries; skill_test_*.json as {doc_name: {R019a: bool, ...}};
+   //   skill_test_阳光资产.json with {doc, results: {R019a: ...}}
+   // - GLM v0.7.1: rule_stats.json as {D01-01: {PASS, FAIL, NA}, ...};
+   //   full_test_results_v[1-6].json as {sample_id: {path, meta,
+   //   results: {D01-01: {verdict, ...}}}} (nested 2 levels deep, which
+   //   is why v0.7.1's shallow walk missed them)
+   //
+   // The collector recurses (depth-limited) and uses two heuristics to
+   // separate rule_ids from sample_ids / doc_names:
+   // 1. Rule-id shape: starts with a letter, ≤ 30 chars, contains digits
+   //    (matches R001, D01-01, T02-31; rejects 06f2ed1488, doc paths)
+   // 2. Verdict shape on values: {verdict, passed, pass, PASS, FAIL}
+   //    keys signal that the parent dict's keys are rule_ids
+   const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
+   const isRuleIdShape = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);
+   const looksLikeVerdict = (v) =>
+     v && typeof v === "object" && !Array.isArray(v) && (
+       v.verdict !== undefined ||
+       v.passed !== undefined ||
+       v.pass !== undefined ||
+       typeof v.PASS === "number" ||
+       typeof v.FAIL === "number"
+     );
+   const collectFromJsonFile = (data, depth = 0) => {
+     if (!data || depth > 4) return;
+     if (typeof data !== "object") return;
+     if (Array.isArray(data)) {
+       for (const r of data) collectFromJsonFile(r, depth + 1);
+       return;
+     }
+     // {rule_id: "X"} or {id: "R001"} on a rule entry
+     if (isRuleIdShape(data.rule_id)) tested.add(data.rule_id);
+     if (isRuleIdShape(data.id)) tested.add(data.id);
+     // {<rule_id>: <verdict_shaped>, ...} (rule_stats / per-doc test_results)
+     for (const [k, v] of Object.entries(data)) {
+       if (isRuleIdShape(k) && looksLikeVerdict(v)) tested.add(k);
      }
+     // {results: {<rule_id>: ...}} — keys must look rule-id-shaped
+     if (data.results && typeof data.results === "object" && !Array.isArray(data.results)) {
+       for (const k of Object.keys(data.results)) {
+         if (isRuleIdShape(k)) tested.add(k);
+       }
+     }
+     // Recurse into nested objects (handles {sample_id: {results: {...}}})
+     for (const v of Object.values(data)) {
+       if (v && typeof v === "object") collectFromJsonFile(v, depth + 1);
+     }
+   };
+
+   const outputDir = path.join(cwd, "output");
+   for (const p of walkFiles(outputDir, { maxDepth: 6 })) {
+     if (!p.endsWith(".json")) continue;
+     collectFromJsonFile(readJsonSafe(p));
+   }
+
+   // DS v070 wrote a top-level aggregate at either rules/test_results.json
+   // OR rule_skills/test_results.json. Both seen in the wild; check both.
+   for (const candidate of [
+     path.join(cwd, "rules", "test_results.json"),
+     path.join(cwd, "rule_skills", "test_results.json"),
+     path.join(cwd, "test_results.json"),
+   ]) {
+     if (fileExists(candidate)) collectFromJsonFile(readJsonSafe(candidate));
    }

    // skillsPassing — per-skill accuracy threshold. Without a uniform
    // schema across agent outputs we report `tested` as the floor; the
    // pipeline's existing _loadTestResults() can layer accuracy on top.
-   return { skillsTested };
+   return { skillsTested: [...tested] };
  }

  // ───────────────────────────────────────────────────────────────────
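The two collector heuristics are worth sanity-checking in isolation — a minimal sketch using the exact regex from the hunk (sample strings invented):

```js
const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
const isRuleIdShape = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);

isRuleIdShape("R001");              // true  — letter prefix, short, has digits
isRuleIdShape("D01-01");            // true  — GLM-style id
isRuleIdShape("06f2ed1488");        // false — starts with a digit
isRuleIdShape("summary");           // false — no digits, so doc_name-like keys are rejected
isRuleIdShape("samples/doc_1.pdf"); // false — "/" and "." aren't in the character class
```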
@@ -205,6 +205,33 @@ export class RuleExtractionPipeline extends Pipeline {
      return `workflow_run is SKILL_TESTING/PRODUCTION_QC-phase work, but engine is in RULE_EXTRACTION. ${exitText}`;
    }

+   // v0.7.1 2a/2b: when the agent attempts phase_advance from
+   // rule_extraction, surface advisories for the two soft-but-load-bearing
+   // artifacts the gate criteria require (chunk_refs and coverage_audit).
+   // The v0.7.0 GLM session forced through with both missing — the gate
+   // refused for the right reason, but the refusal text was generic.
+   // Name them inline.
+   if (toolName === "phase_advance" && toolInput?.to === "skill_authoring") {
+     const advisories = [];
+     if (this.rulesExtracted.length > 0 && this.rulesWithChunkRefs.length === 0) {
+       advisories.push(
+         `Advancing rule_extraction with rulesWithChunkRefs=0/${this.rulesExtracted.length}. ` +
+         `The skill_authoring phase's prompts use source_chunk_ids to ground ` +
+         `skill explanations against regulation text. Without them, skill authoring ` +
+         `runs blind. Either populate chunk refs via the rule_catalog tool, or ` +
+         `accept that skill_authoring's generated content won't cite source regulation.`,
+       );
+     }
+     if (this.rulesExtracted.length > 0 && !this.coverageAudited) {
+       advisories.push(
+         `Advancing rule_extraction without rules/coverage_audit.md (or .json). ` +
+         `Coverage audit identifies regulation articles you didn't extract a rule ` +
+         `for — without it, gaps go silent through to production. If your ` +
+         `extraction is genuinely complete, write a one-paragraph audit confirming so.`,
+       );
+     }
+     if (advisories.length > 0) return advisories.join("\n\n");
+   }
+
    return null;
  }

@@ -5,6 +5,7 @@ import { PipelineEvent } from "./index.js";
  import { Pipeline } from "./base.js";
  import { normalizeRuleCatalog } from "../rule-catalog-normalize.js";
  import { deriveFinalizationMilestones } from "./_milestone-derive.js";
+ import { readKcVersion } from "../../util/kc-version.js";

  const __dirname = path.dirname(fileURLToPath(import.meta.url));
  // v0.7.0 N: ship template/release/v1/ from the npm package; copy into
@@ -310,12 +311,7 @@ export class FinalizationPipeline extends Pipeline {
  }

  _readKcVersion() {
-   try {
-     const pkg = JSON.parse(fs.readFileSync(
-       path.resolve(__dirname, "../../../package.json"), "utf-8",
-     ));
-     return pkg.version || "unknown";
-   } catch { return "unknown"; }
+   return readKcVersion();
  }

  /**
@@ -240,6 +240,19 @@ export class ProjectInitializer extends Pipeline {
    return this.workspaceCreated && this.configReady && this.hasRegulations && this.hasSamples;
  }

+ // v0.7.2 1e: surface the checklist as engine telemetry so
+ // `_buildEngineCountsBlock("bootstrap")` has something to report when
+ // bootstrap → rule_extraction is refused. The agent sees the missing
+ // criteria directly in the refusal text.
+ describeBootstrapChecklist() {
+   return {
+     workspaceCreated: !!this.workspaceCreated,
+     configReady: !!this.configReady,
+     hasRegulations: !!this.hasRegulations,
+     hasSamples: !!this.hasSamples,
+   };
+ }
+
  /**
   * v0.6.3 (#74): nudge the agent when it does work that belongs to a later
   * phase. Bootstrap is setup — reading rules/samples, configuring keys,
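For reference, this is the shape of the telemetry the checklist feeds into the refusal text — a sketch assuming a run where samples haven't landed yet (flag values invented):

```js
const cl = { workspaceCreated: true, configReady: true, hasRegulations: true, hasSamples: false };
const parts = Object.entries(cl).map(([k, v]) => `${k}: ${v ? "yes" : "no"}`);
console.log(parts.join(", "));
// → workspaceCreated: yes, configReady: yes, hasRegulations: yes, hasSamples: no
```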
@@ -14,6 +14,11 @@ export class SkillTestingPipeline extends Pipeline {
    this.iterationCount = 0;
    this._accuracyThreshold = 0.9;
    this._maxIterations = 20;
+   // v0.7.1 1b: rate-limit phaseMisfitHint firing for ephemeral
+   // sandbox tests. Caps at ~3 nudges per phase entry so the agent
+   // sees the path expectation but doesn't get spammed during a
+   // batch run.
+   this._misfit_nudge_count = 0;
    this._scanWorkspace();
  }

@@ -132,6 +137,12 @@ export class SkillTestingPipeline extends Pipeline {
   * v0.6.3 (#74): SKILL_TESTING runs check scripts against test samples and
   * measures accuracy. Writing distillation outputs or production results
   * here means phase boundaries got skipped.
+  *
+  * v0.7.1 1b: also nudges agents who run check scripts via sandbox_exec
+  * but don't persist verdicts. E2E #6 v070 surfaced this — both
+  * conductors batched tests in one sandbox_exec call, read pass/fail
+  * from stdout, then declared "testing done" while the engine saw
+  * skillsTested=0 because nothing landed in a recognized path.
   */
  phaseMisfitHint(toolName, toolInput, result) {
    if (result?.isError) return null;
@@ -148,6 +159,34 @@ export class SkillTestingPipeline extends Pipeline {
      return `Writing under output/results/ is PRODUCTION_QC-phase work, but engine is in SKILL_TESTING. ${exitText}`;
    }
  }

+ // v0.7.1 1b: sandbox_exec test-command nudge
+ if (toolName === "sandbox_exec") {
+   const cmd = String(toolInput?.command || "");
+   const looksLikeTest =
+     /python.*check.*\.py.*\.(txt|pdf|md|docx)/i.test(cmd) ||
+     /pytest|unittest|run_tests/i.test(cmd) ||
+     /python.*workflow.*\.py.*samples/i.test(cmd);
+   if (!looksLikeTest) return null;
+
+   const tested = Object.keys(this.skillsTested).length;
+   const total = this.skillsToTest.length;
+   // Already satisfied? Don't nudge.
+   if (total === 0 || tested >= total) return null;
+
+   // Rate-limit: ~3 per phase. Counter resets on phase entry
+   // (constructor) and on importState if available.
+   this._misfit_nudge_count = (this._misfit_nudge_count || 0) + 1;
+   if (this._misfit_nudge_count > 3) return null;
+
+   return (
+     `Engine derives skillsTested from rule_skills/<id>/test_results.json, ` +
+     `rule_skills/<id>/tests/, OR output/*.json with a rule_id field. ` +
+     `Sandbox runs are ephemeral — record per-rule verdicts to one of ` +
+     `those paths before phase_advance. Currently the engine sees ` +
+     `${tested}/${total} skills tested.`
+   );
+ }
  return null;
}

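A quick sanity sketch of which commands trip the new nudge, exercising the three `looksLikeTest` patterns from the hunk (commands invented):

```js
const looksLikeTest = (cmd) =>
  /python.*check.*\.py.*\.(txt|pdf|md|docx)/i.test(cmd) ||
  /pytest|unittest|run_tests/i.test(cmd) ||
  /python.*workflow.*\.py.*samples/i.test(cmd);

looksLikeTest("python rule_skills/R001/check.py samples/doc_01.txt"); // true — per-rule check
looksLikeTest("python workflows/run_batch.py samples/");              // true — batch workflow run
looksLikeTest("pytest -q");                                           // true — test runner
looksLikeTest("python scripts/parse_catalog.py");                     // false — no nudge
```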
@@ -15,7 +15,11 @@ const VALID_PHASES = new Set(Object.values(Phase));
   */
  export class PhaseAdvanceTool extends BaseTool {
    /**
-    * @param {(to: string, reason: string, opts: {force?: boolean}) => boolean} advanceFn
+    * @param {(to: string, reason: string, opts: {force?: boolean}) => {advanced: boolean, engineCounts?: string}} advanceFn
+    *   v0.7.1 2c: returns the rich object so the tool can surface engine
+    *   telemetry in the refusal text. Internal engine callers of
+    *   `_advancePhase` still get the bool; only this LLM-facing tool
+    *   uses the wrapped form.
     * @param {() => string} getCurrentPhaseFn - H1: lets the tool read the
     *   engine's phase BEFORE the call, so it can distinguish "already there"
     *   (silent no-op, informational) from "non-adjacent refusal" (actionable).
@@ -91,7 +95,11 @@ export class PhaseAdvanceTool extends BaseTool {
      );
    }

-   const advanced = this._advance(to, input.reason || "agent request", { force: !!input.force });
+   // v0.7.1 2c: advanceFn returns {advanced, engineCounts?} so we can
+   // surface telemetry in the refusal text below. Internal callers of
+   // _advancePhase still get the bool; only this LLM-facing tool unwraps.
+   const advanceResult = this._advance(to, input.reason || "agent request", { force: !!input.force });
+   const advanced = !!advanceResult?.advanced;
    if (advanced) {
      // Log the ack so post-mortems can find phase advances that proceeded
      // with live subagents
@@ -113,9 +121,18 @@ export class PhaseAdvanceTool extends BaseTool {
    // immediately (12/12 transitions). The escape valve remains in the input
    // schema (discoverable) but isn't hand-fed to the LLM here. Instead,
    // direct the agent at the missing milestones it can satisfy.
+   //
+   // v0.7.1 2c: include engineCounts when available so the agent sees
+   // exactly which milestones the gate is reading and can satisfy them.
+   // E2E #6 v070 showed the generic "check /status" hint wasn't concrete
+   // enough — agents forced through. Naming the gap inline reduces that.
+   const engineCountsLine = advanceResult?.engineCounts
+     ? `\nEngine telemetry: ${advanceResult.engineCounts}`
+     : "";
+
    return new ToolResult(
      `Did not advance to ${to} (currently in ${beforePhase || "?"}). ` +
-     `Likely cause: source-phase exit criteria not met. ` +
+     `Likely cause: source-phase exit criteria not met.${engineCountsLine}\n\n` +
      `Run /status (or read the phase describeState block in this turn's system reminder) ` +
      `to see which milestones are missing, then produce the disk artifacts that satisfy them — ` +
      `the engine derives milestones from filesystem facts (rule_skills/<id>/SKILL.md, check.py, ` +
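Put together, a refusal now reads roughly like this — a sketch with a hypothetical engineCounts string; the real milestone names come from `_buildEngineCountsBlock`:

```js
const advanceResult = { advanced: false, engineCounts: "skillsTested: 3/10, iterations: 2" };
const engineCountsLine = advanceResult?.engineCounts
  ? `\nEngine telemetry: ${advanceResult.engineCounts}`
  : "";
console.log(
  `Did not advance to production_qc (currently in skill_testing). ` +
  `Likely cause: source-phase exit criteria not met.${engineCountsLine}`
);
// Did not advance to production_qc (currently in skill_testing). Likely cause: source-phase exit criteria not met.
// Engine telemetry: skillsTested: 3/10, iterations: 2
```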
@@ -178,7 +178,21 @@ export class ReleaseTool extends BaseTool {
      path.join(bundleAbs, "glossary.json"), { fallback: '{"version":1,"entries":[]}\n' });
    this._copyIfExists(path.join(this._workspace.cwd, "corner_cases.json"),
      path.join(bundleAbs, "corner_cases.json"), { fallback: '[]\n' });
-   this._copyIfExists(path.join(this._workspace.cwd, "confidence_calibration.json"),
+   // v0.7.2 1c: auto-aggregate from output/ if no calibration file exists
+   // at the workspace root. Both v0.7.1 audit runs (DS + GLM) shipped
+   // releases with an empty `historical_accuracy: {}` despite having
+   // per-rule QC data on disk under output/ — the release tool just passed
+   // the file through and emitted a stub on miss. We try to populate from
+   // known QC artifact shapes here; if nothing matches, fall through to
+   // the existing stub fallback.
+   const calibSrc = path.join(this._workspace.cwd, "confidence_calibration.json");
+   if (!fs.existsSync(calibSrc)) {
+     const aggregated = this._aggregateAccuracyFromOutput();
+     if (aggregated && Object.keys(aggregated.historical_accuracy).length > 0) {
+       fs.writeFileSync(calibSrc, JSON.stringify(aggregated, null, 2) + "\n", "utf-8");
+     }
+   }
+   this._copyIfExists(calibSrc,
      path.join(bundleAbs, "confidence_calibration.json"),
      { fallback: '{"historical_accuracy":{}}\n' });

@@ -233,6 +247,30 @@ export class ReleaseTool extends BaseTool {
      .replace(/\{RULES_LIST\}/g, rulesList);
    fs.writeFileSync(path.join(bundleAbs, "README.md"), readme, "utf-8");

+   // v0.7.2 1d: clean up the template scaffold dir if a customized
+   // release was just written alongside it. Both v0.7.1 audit runs
+   // shipped with `output/releases/v1/` (template-derived, .tmpl files
+   // lingering) AND `output/releases/v1-0/` (or v1-0-hybrid/) — the
+   // customized release. The pre-scaffold is meant as a hint; once the
+   // agent calls `release(label="v1-0")` and we've written the real
+   // bundle, the unedited scaffold is just clutter.
+   //
+   // Conservative gate: only delete a sibling `v1/` if BOTH (a) we
+   // didn't just write to v1/ ourselves, AND (b) it still contains
+   // .tmpl files (the signature of an unedited template). If the agent
+   // intentionally edited v1/ in place (removing .tmpl), our cleanup
+   // leaves it alone.
+   if (slug !== "v1") {
+     const tmplScaffold = path.join(this._workspace.resolvePath(path.join("output", "releases")), "v1");
+     if (fs.existsSync(tmplScaffold) && fs.statSync(tmplScaffold).isDirectory()) {
+       let hasTmpl = false;
+       try { hasTmpl = fs.readdirSync(tmplScaffold).some((f) => f.endsWith(".tmpl")); } catch { /* ignore */ }
+       if (hasTmpl) {
+         try { fs.rmSync(tmplScaffold, { recursive: true, force: true }); } catch { /* best-effort */ }
+       }
+     }
+   }
+
    // Bundle dir is in output/ (gitignored). Snapshot manifest in snapshots/ IS tracked.
    const lines = [
      `Release '${label}' bundled at ${bundleRel}`,
@@ -319,6 +357,118 @@ export class ReleaseTool extends BaseTool {
    return null;
  }

+ // v0.7.2 1c: walk output/ for QC artifacts and aggregate per-rule
+ // accuracy. Recognized shapes (covering the DS + GLM v0.7.1 audit runs):
+ //
+ //   rule_stats_v*.json — {<rule_id>: {PASS: N, FAIL: N, NOT_APPLICABLE: N, ERROR: N}}
+ //     (GLM produced 4 versions; pick the highest)
+ //   full_test_results_v*.json — {<sample_id>: {results: {<rule_id>: {verdict}}}}
+ //     (GLM; accumulate verdicts per rule across samples)
+ //   skill_test_*.json — {<doc_name>: {<rule_id>: bool}} (DS shape)
+ //
+ // Returns null if no recognized artifact, or an object with
+ // { historical_accuracy: {<rule_id>: {pass_rate, n_samples, ...}}, computed_at, source_files }
+ // suitable for confidence_calibration.json.
+ _aggregateAccuracyFromOutput() {
+   const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
+   const isRuleId = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);
+   const tally = new Map(); // rule_id -> {pass, fail, na, n}
+   const sourceFiles = [];
+   const bump = (rid, kind) => {
+     if (!isRuleId(rid)) return;
+     const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
+     t[kind] += 1;
+     t.n += 1;
+     tally.set(rid, t);
+   };
+   const outputDir = path.join(this._workspace.cwd, "output");
+   if (!fs.existsSync(outputDir)) return null;
+
+   // Collect all .json files under output/ (depth-limited)
+   const files = [];
+   const stack = [{ dir: outputDir, depth: 0 }];
+   while (stack.length) {
+     const { dir, depth } = stack.pop();
+     let entries;
+     try { entries = fs.readdirSync(dir, { withFileTypes: true }); } catch { continue; }
+     for (const e of entries) {
+       if (e.name.startsWith(".") || e.name === "__pycache__") continue;
+       const p = path.join(dir, e.name);
+       if (e.isDirectory()) {
+         if (depth < 6) stack.push({ dir: p, depth: depth + 1 });
+       } else if (e.isFile() && e.name.endsWith(".json")) {
+         files.push({ path: p, name: e.name });
+       }
+     }
+   }
+
+   // 1) Prefer rule_stats_v<N>.json (highest version) — direct counts
+   const ruleStatsFiles = files
+     .filter((f) => /^rule_stats(?:_v\d+)?\.json$/i.test(f.name))
+     .map((f) => ({ ...f, ver: (f.name.match(/_v(\d+)/) || [0, 0])[1] | 0 }))
+     .sort((a, b) => b.ver - a.ver);
+   if (ruleStatsFiles.length > 0) {
+     const top = ruleStatsFiles[0];
+     try {
+       const d = JSON.parse(fs.readFileSync(top.path, "utf-8"));
+       for (const [rid, stats] of Object.entries(d)) {
+         if (!isRuleId(rid) || !stats || typeof stats !== "object") continue;
+         const pass = stats.PASS | 0, fail = stats.FAIL | 0;
+         const na = stats.NOT_APPLICABLE | stats.NA | 0;
+         const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
+         t.pass += pass; t.fail += fail; t.na += na; t.n += pass + fail + na;
+         tally.set(rid, t);
+       }
+       sourceFiles.push(path.relative(this._workspace.cwd, top.path));
+     } catch { /* fall through to other shapes */ }
+   }
+
+   // 2) Fallback: full_test_results*.json with nested {sample_id: {results: {rid: {verdict}}}}
+   if (tally.size === 0) {
+     const ftrFiles = files
+       .filter((f) => /^full_test_results(?:_v\d+)?\.json$/i.test(f.name))
+       .map((f) => ({ ...f, ver: (f.name.match(/_v(\d+)/) || [0, 0])[1] | 0 }))
+       .sort((a, b) => b.ver - a.ver);
+     for (const f of ftrFiles.slice(0, 1)) {
+       try {
+         const d = JSON.parse(fs.readFileSync(f.path, "utf-8"));
+         for (const sample of Object.values(d)) {
+           if (!sample || typeof sample !== "object") continue;
+           const results = sample.results;
+           if (!results || typeof results !== "object") continue;
+           for (const [rid, r] of Object.entries(results)) {
+             if (!isRuleId(rid) || !r || typeof r !== "object") continue;
+             const verdict = (r.verdict || "").toString().toUpperCase();
+             if (verdict === "PASS") bump(rid, "pass");
+             else if (verdict === "FAIL") bump(rid, "fail");
+             else if (verdict === "NOT_APPLICABLE" || verdict === "NA") bump(rid, "na");
+           }
+         }
+         sourceFiles.push(path.relative(this._workspace.cwd, f.path));
+       } catch { /* try next shape */ }
+     }
+   }
+
+   if (tally.size === 0) return null;
+
+   const historical_accuracy = {};
+   for (const [rid, t] of tally.entries()) {
+     const fired = t.pass + t.fail;
+     historical_accuracy[rid] = {
+       pass_rate: fired > 0 ? +(t.pass / fired).toFixed(4) : null,
+       n_passed: t.pass,
+       n_failed: t.fail,
+       n_not_applicable: t.na,
+       n_samples: t.n,
+     };
+   }
+   return {
+     historical_accuracy,
+     computed_at: new Date().toISOString(),
+     source_files: sourceFiles,
+   };
+ }
+
  _readWorkerTiers() {
    const envPath = path.join(this._workspace.cwd, ".env");
    const out = { tier1: "", tier2: "", tier3: "", tier4: "" };
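The per-rule arithmetic is easy to verify by hand — a sketch of what the aggregator does with one rule_stats entry (counts invented, shape as documented above):

```js
// One entry from a hypothetical output/rule_stats_v4.json: { "D01-01": stats }
const stats = { PASS: 70, FAIL: 10, NOT_APPLICABLE: 10 };
const fired = stats.PASS + stats.FAIL;          // NA verdicts don't count toward the rate
const entry = {
  pass_rate: +(stats.PASS / fired).toFixed(4),  // 0.875
  n_passed: stats.PASS,
  n_failed: stats.FAIL,
  n_not_applicable: stats.NOT_APPLICABLE,
  n_samples: stats.PASS + stats.FAIL + stats.NOT_APPLICABLE, // 90
};
// confidence_calibration.json then carries { historical_accuracy: { "D01-01": entry }, ... }
```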
@@ -0,0 +1,27 @@
+ // Single source of truth for the live KC CLI version string.
+ //
+ // Reads package.json. Used by engine.js (passed to ReleaseTool so
+ // release manifests stamp the correct version) and by
+ // pipelines/finalization.js (anywhere it surfaces "Built by kc-beta X").
+ //
+ // Before v0.7.2, engine.js hardcoded `kcVersion: "0.5.2"`, which leaked
+ // into every release manifest's `kc_beta_version` field regardless of
+ // the actual package version. Both v0.7.1 audit runs (DS + GLM)
+ // surfaced this. Reading package.json closes the gap.
+
+ import fs from "node:fs";
+ import path from "node:path";
+ import { fileURLToPath } from "node:url";
+
+ const __filename = fileURLToPath(import.meta.url);
+ const __dirname = path.dirname(__filename);
+
+ export function readKcVersion() {
+   try {
+     const pkgPath = path.resolve(__dirname, "../../package.json");
+     const pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
+     return pkg.version || "unknown";
+   } catch {
+     return "unknown";
+   }
+ }
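Downstream callers then need only the one import — a usage sketch (the relative path depends on the caller, as in the engine.js and finalization.js hunks above):

```js
import { readKcVersion } from "../util/kc-version.js";

// "0.7.2" when run from this release; "unknown" if package.json can't be read.
const manifest = { kc_beta_version: readKcVersion() };
```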
@@ -223,6 +223,24 @@ Regulations are often ambiguous. When you encounter ambiguity:
 
  Do not skip ambiguous rules. They are often the most important ones.
 
+ ## Sanity-check applicability against the sample corpus
+
+ After extracting your rule catalog and before authoring skills, do this 5-minute check: project each rule's applicability filter against the sample corpus.
+
+ For every rule:
+ 1. Walk `samples/` and classify each sample by product type / report type / document format
+ 2. For each rule, count how many samples it would apply to (per the rule's `applicability` field, scope filter, or whatever shape your catalog uses)
+ 3. Flag rules that apply to **0 samples** — they're either genuinely test-corpus-irrelevant (acceptable) or over-constrained (a bug)
+
+ E2E #7 GLM produced a 97-rule catalog where 36 rules (37%) had `PASS=0 FAIL=0 NOT_APPLICABLE=90` across all 90 documents — they never fired. Some were legitimate (rules for cash-management products with no cash-management samples in the corpus), but 36 inactive rules out of 97 was high enough to suggest scope-too-narrow drift.
+
+ If many rules are 0-sample, either:
+ - **Reframe their applicability** — broaden product types, look for evidence in headers/footers rather than just the body, relax the scope filter
+ - **Document them as "future scope"** and remove them from this iteration's catalog (still capture them in a `rules/future_scope.md` so they're not forgotten)
+ - **Update the test corpus** to include matching samples (work with the developer user)
+
+ Catching this in `rule_extraction` is much cheaper than authoring 36 skills that then test as inactive in `skill_testing`. The cheap projection here is worth the time it saves later.
+
  ## When Rules Change
 
  Regulations evolve. When the developer user adds new or updated regulation documents:
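The projection in step 2 is a few lines if your catalog carries a structured applicability filter — a sketch with a hypothetical `product_types` schema (field names invented; adapt to whatever shape your catalog uses):

```js
// Hypothetical classified inputs — not KC's actual schema:
const samples = [{ product_type: "fixed_income" }, { product_type: "equity" }];
const catalog = [
  { id: "R001", applicability: { product_types: ["fixed_income"] } },
  { id: "R002", applicability: { product_types: ["cash_management"] } },
];
const inactive = catalog.filter(
  (rule) => !samples.some((s) => rule.applicability.product_types.includes(s.product_type)),
);
console.log(`${inactive.length}/${catalog.length} rules match 0 samples`); // 1/2 — R002 never fires
// Anything near E2E #7's 36/97 is a signal to reframe, defer, or extend the corpus.
```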
@@ -45,6 +45,32 @@ If yes, design a worker LLM prompt. Use the smallest model tier that maintains a
  ### The hybrid approach (most common)
  Most rules are a mix: regex extracts the number, Python compares it to the threshold, and the LLM handles the exceptional cases. Design the workflow as a pipeline where cheap steps run first and expensive steps run only when needed.
 
+ ### When regex alone isn't enough — decision rubric
+
+ Before declaring distillation complete, audit each rule's `verification_type` / `metric` / `evidence_type` (or the equivalent fields in your catalog). For rules where the required verification is one of:
+
+ - **Semantic** ("is this a positive guarantee or a disclaimer?")
+ - **Contextual** ("interpret this in light of the document's product type")
+ - **Counterfactual** ("what should this value be, given the other fields?")
+ - **Cross-field arithmetic** ("does 期初 + 收益 - 分配 = 期末?")
+
+ regex alone rarely suffices. Three acceptable forms:
+
+ 1. **Pure regex with documented limits** — write the regex check and include a comment explaining the fragility (e.g., "matches the syntactic pattern only; cannot detect semantic guarantees")
+ 2. **Hybrid regex + LLM** — a regex baseline catches the obvious cases; `worker_llm_call` (tier1-2) handles the ambiguous ones. The hybrid workflow declares which rule_ids escalate.
+ 3. **Pure LLM via `worker_llm_call`** — for fully semantic rules where no regex baseline is meaningful.
+
+ Don't ship pure regex for a rule whose `verification_type` is `judgment` / `semantic` without the documented-limits note. Future-you or a colleague will assume the regex is sufficient, and that bug will hide for months.
+
+ ### Worker LLM cost-aware tier choice
+
+ If you do escalate to an LLM:
+ - **tier1** (most capable, ~¥0.001-0.002/doc): cross-field reasoning, ambiguity resolution, rules that benefit from chain-of-thought
+ - **tier2-3**: bulk extraction with simple semantic checks
+ - **tier4** (cheapest): high-volume keyword-spotting that regex can't handle. Note: tier4 models on SiliconFlow are Qwen3.5 thinking-mode — `content` can come back empty if `reasoning_content` consumes max_tokens. Test with realistic prompts before relying on it. If you see empty responses, either bump max_tokens to ≥ 8192, shorten your prompt, or fall back to tier1-2.
+
+ Both v0.7.1 audit conductors (DS and GLM) defaulted to all-regex distillation and only added LLM escalation when the human user explicitly asked for "V2 with worker LLM". If your rule catalog has any rules where the verification is genuinely semantic, reach for `worker_llm_call` yourself — don't wait to be asked.
+
  ## Workflow Structure
 
  A workflow is a Python file (or small set of files) in `workflows/`:
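All three forms in the rubric above share one control-flow shape: the cheap check always runs first, and only what stays ambiguous escalates. A sketch with hypothetical `regexCheck` / `llmVerify` helpers standing in for the regex baseline and `worker_llm_call` (not KC's actual API):

```js
// Hypothetical helpers — invented for illustration of form 2's control flow.
const regexCheck = (text) => ({
  passed: /不保证(本金|收益)/.test(text),                       // syntactic disclaimer found
  ambiguous: /保证/.test(text) && !/不保证/.test(text),         // guarantee-like wording, unclear polarity
});
const llmVerify = async (text) => ({ passed: false, method: "worker_llm" }); // placeholder

async function verify(text) {
  const base = regexCheck(text);                  // cheap step always runs first
  if (!base.ambiguous) return { ...base, method: "regex" };
  return llmVerify(text);                         // escalate only the ambiguous remainder
}
```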
@@ -101,6 +101,87 @@ The v0.6.2 D2 anti-pattern wording captures the failure case clearly:
 
  That came from E2E #4 where one conductor wrote a 2,400-line `unified_qc.py` that ran all rules at once. It produced 1,150 ERROR verdicts (16.6%) because every rule's failure cascaded into every other rule's verdict. Per-rule skills are KC's unit of granularity for a reason.
 
+ ### Anti-pattern: stub check.py + real workflow.py
+
+ Do NOT make `rule_skills/<id>/check.py` a stub that defers to `workflows/<id>/workflow.py`. KC's intent: SKILL.md + check.py is the **canonical** verification. workflow.py is the **distilled, cheaper** form (regex baseline + LLM fallback). The relationship is skill → workflow, not workflow → skill.
+
+ ❌ DON'T:
+ ```python
+ # rule_skills/R001/check.py — STUB, real logic elsewhere
+ def check(text):
+     rule_ids = re.findall(r"R\d{3}", load_skill())
+     return {rid: {"pass": None, "method": "stub",
+                   "note": "to be implemented later"} for rid in rule_ids}
+ # real verification logic only in workflows/R001/workflow_v1.py
+ ```
+
+ ✅ DO:
+ ```python
+ # rule_skills/R001/check.py — canonical verification
+ def check(text):
+     matches = re.findall(r"...", text)  # actual rule logic
+     return {"rule_id": "R001", "passed": bool(matches),
+             "evidence": matches[:3], "method": "regex"}
+
+ # workflows/R001/workflow_v1.py — distilled, cheaper form
+ def run(text, llm_fn=None):
+     result = check(text)  # baseline from skill
+     if not result["passed"] and llm_fn:
+         result = llm_verify(text, llm_fn)  # escalate on fail
+     return result
+ ```
+
+ Why it matters: distillation-phase consumers (the release tool, the run.py harness) load workflow.py. If check.py is a stub, the skill's methodology (SKILL.md) becomes documentation-only and the verification logic is scattered across N workflow files. Future iterations of the skill (changes to regulation interpretation, edge cases discovered in production) need a single canonical place to update — the skill — not N workflows that have drifted independently.
+
+ E2E #6 v070 surfaced this pattern (DS's bundled-skill check.py files all returned `{"pass": null, "method": "stub"}`, deferring to workflows/). v0.7.1 added this anti-pattern explicitly.
+
+ E2E #7 v071 showed the teaching prevented the stub anti-pattern in both conductors (no `{"pass": null}` patterns in either run), but **DS still inverted the canonical-vs-distilled relationship**: DS's 6 thematic skill folders had SKILL.md only (no check.py), with the real verification code living in `workflows/<skill>/check.py`. The absence of stubs is good; the inversion is not — editing a rule then requires touching both SKILL.md (the doc) and the workflow's check.py (the code). The single source of truth is lost.
+
+ GLM v071, by contrast, landed the canonical pattern: 97/97 skills had both SKILL.md AND a real `check.py` (median 143 LOC of regex + applicability logic), and `workflows/<id>/workflow_v1.py` was a 50-line thin wrapper that imported and called it:
+
+ ```python
+ # workflows/D01-01/workflow_v1.py — thin wrapper, 52 LOC
+ import importlib.util, json
+ from pathlib import Path
+
+ def run(doc_text: str, meta: dict = None) -> dict:
+     check_path = Path(__file__).parent.parent.parent / "rule_skills" / "D01-01" / "check.py"
+     spec = importlib.util.spec_from_file_location("check", check_path)
+     mod = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(mod)
+     result = mod.check(doc_text, meta)
+     result["_workflow"] = "D01-01_v1"
+     return result
+ ```
+
+ This is the v0.7.2+ canonical pattern: the workflow is a shim that points at the skill's check.py. To iterate on a rule's verification, edit `rule_skills/<id>/check.py`. The workflow doesn't change. v0.7.2 clarifies the teaching: avoid stubs AND keep the canonical relationship (the skill is canonical, the workflow is a distilled wrapper).
+
  ### Naming convention for grouped checks
 
  When you do bundle, name the file with the explicit range:
@@ -263,4 +344,26 @@ When entering skill_authoring with an empty TaskBoard:
  5. **Pick the first task.** Work it to completion (skill + check + at least one local test). Update PATTERNS.md with whatever you learned. Move to the next task.
  6. **At task ~5 and task ~10:** stop and re-read PATTERNS.md. If the patterns suggest a refactor of earlier work, do it now (cheap) rather than later (expensive).
 
- The engine's filesystem-derived milestones (Group A v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract.
+ ### Persisted methodology: PATTERNS.md OR phase logs OR AGENT.md decisions
+
+ The principle: capture framework-level decisions to disk before each phase advance. The conversation will compact, agents will restart, and the next phase will lose grounding. Whichever format you pick, write to disk — don't rely on conversation context that disappears.
+
+ Three formats, each defensible. Pick one and stick with it:
+
+ - **`rules/PATTERNS.md`** — concise, framework-only, updated as the project progresses. Best for greenfield projects with a clear hypothesis-up-front structure. Capped at ~5 KB; entries are transferable shapes / project constraints / anti-patterns with rationale (see "What to write" above).
+
+ - **`logs/phase_<name>_complete.md` per phase** — incremental; captures what each phase produced, the decisions made, and what the next phase inherits. Best for iterative discovery work where the framework crystallizes mid-run. E2E #7 GLM used this pattern across 6 phase docs and an `evolution_summary_v1.2.md`; the methodology was captured even though PATTERNS.md was never written.
+
+ - **`AGENT.md` decisions section + domain notes** — narrative-style; a living document of "what we know" and "why". Best for projects with rich domain context to capture (regulations, edge cases, thresholds, sample format distributions). E2E #7 GLM's AGENT.md included regulation enforcement dates, product type taxonomies, threshold values, and sample format counts — this is fine; it's a different idiom for the same goal.
+
+ What you should NOT do: skip persistence and rely only on the live conversation context. By the time you have N skills authored without any persisted methodology, you've made N implicit decisions about verdict shape, chunker boundaries, and worker tier. Each rule re-derives from scratch, and refactoring requires touching N files instead of one.
+
+ ❌ "I'll capture insights when I have time."
+
+ ✅ "Before each phase advance, write what I learned to whichever persistence file matches this project's idiom — even if it's tentative."
+
+ E2E history:
+ - E2E #6 v070: DS wrote PATTERNS.md only after a rollback. Per-skill decisions made before that point had to be re-touched. v0.7.1 added "PATTERNS.md FIRST" reinforcement.
+ - E2E #7 v071: neither DS nor GLM wrote PATTERNS.md, but GLM wrote 6 rich phase-completion logs and a comprehensive AGENT.md — the methodology WAS captured, just in different files. v0.7.2 blesses the broader principle: persist before you advance; the format is flexible.
+
+ The engine's filesystem-derived milestones (Group A v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract; the persistence file is your project's memory.
@@ -222,6 +222,24 @@ Regulations are often ambiguous. When you encounter ambiguity:
 
  Do not skip ambiguous rules. They are often the most important ones.
 
+ ## Sanity-check applicability against the sample corpus
+
+ After extracting your rule catalog and before authoring skills, do this 5-minute check: project each rule's applicability filter against the sample corpus.
+
+ For every rule:
+ 1. Walk `samples/` and classify each sample by product type / report type / document format
+ 2. For each rule, count how many samples it would apply to (per the rule's `applicability` field, scope filter, or whatever shape your catalog uses)
+ 3. Flag rules that apply to **0 samples** — they're either genuinely test-corpus-irrelevant (acceptable) or over-constrained (a bug)
+
+ E2E #7 GLM produced a 97-rule catalog where 36 rules (37%) had `PASS=0 FAIL=0 NOT_APPLICABLE=90` across all 90 documents — they never fired. Some were legitimate (rules for cash-management products with no cash-management samples in the corpus), but 36 inactive rules out of 97 was high enough to suggest scope-too-narrow drift.
+
+ If many rules are 0-sample, either:
+ - **Reframe their applicability** — broaden product types, look for evidence in headers/footers rather than just the body, relax the scope filter
+ - **Document them as "future scope"** and remove them from this iteration's catalog (still capture them in a `rules/future_scope.md` so they're not forgotten)
+ - **Update the test corpus** to include matching samples (work with the developer user)
+
+ Catching this in `rule_extraction` is much cheaper than authoring 36 skills that then test as inactive in `skill_testing`. The cheap projection here is worth the time it saves later.
+
  ## When Rules Change
 
  Regulations evolve. When the developer user adds new or updated regulation documents:
@@ -45,6 +45,32 @@ If yes, design a worker LLM prompt. Use the smallest model tier that maintains a
  ### The hybrid approach (most common)
  Most rules are a mix: regex extracts the number, Python compares it to the threshold, and the LLM handles the exceptional cases. Design the workflow as a pipeline where cheap steps run first and expensive steps run only when needed.
 
+ ### When regex alone isn't enough — decision rubric
+
+ Before declaring distillation complete, audit each rule's `verification_type` / `metric` / `evidence_type` (or the equivalent fields in your catalog). For rules where the required verification is one of:
+
+ - **Semantic** ("is this a positive guarantee or a disclaimer?")
+ - **Contextual** ("interpret this in light of the document's product type")
+ - **Counterfactual** ("what should this value be, given the other fields?")
+ - **Cross-field arithmetic** ("does 期初 + 收益 - 分配 = 期末?")
+
+ regex alone rarely suffices. Three acceptable forms:
+
+ 1. **Pure regex with documented limits** — write the regex check and include a comment explaining the fragility (e.g., "matches the syntactic pattern only; cannot detect semantic guarantees")
+ 2. **Hybrid regex + LLM** — a regex baseline catches the obvious cases; `worker_llm_call` (tier1-2) handles the ambiguous ones. The hybrid workflow declares which rule_ids escalate.
+ 3. **Pure LLM via `worker_llm_call`** — for fully semantic rules where no regex baseline is meaningful.
+
+ Don't ship pure regex for a rule whose `verification_type` is `judgment` / `semantic` without the documented-limits note. Future-you or a colleague will assume the regex is sufficient, and that bug will hide for months.
+
+ ### Worker LLM cost-aware tier choice
+
+ If you do escalate to an LLM:
+ - **tier1** (most capable, ~¥0.001-0.002/doc): cross-field reasoning, ambiguity resolution, rules that benefit from chain-of-thought
+ - **tier2-3**: bulk extraction with simple semantic checks
+ - **tier4** (cheapest): high-volume keyword-spotting that regex can't handle. Note: tier4 models on SiliconFlow are Qwen3.5 thinking-mode — `content` can come back empty if `reasoning_content` consumes max_tokens. Test with realistic prompts before relying on it. If you see empty responses, either bump max_tokens to ≥ 8192, shorten your prompt, or fall back to tier1-2.
+
+ Both v0.7.1 audit conductors (DS and GLM) defaulted to all-regex distillation and only added LLM escalation when the human user explicitly asked for "V2 with worker LLM". If your rule catalog has any rules where the verification is genuinely semantic, reach for `worker_llm_call` yourself — don't wait to be asked.
+
  ## Workflow Structure
 
  A workflow is a Python file (or small set of files) in `workflows/`:
@@ -101,6 +101,82 @@ The v0.6.2 D2 anti-pattern wording already states the failure case clearly:
 
  That came from E2E #4: one conductor wrote a 2,400-line `unified_qc.py` that ran every rule at once. It produced 1,150 ERROR verdicts (16.6%) because each rule's failure cascaded into every other rule's verdict. Per-rule skills are KC's unit of granularity for a reason.
 
+ ### Anti-pattern: check.py as a stub + the real logic in workflow.py
+
+ Do **NOT** write `rule_skills/<id>/check.py` as a placeholder that defers the real logic to `workflows/<id>/workflow.py`. KC's design intent: SKILL.md + check.py is the **canonical** verification; workflow.py is the **distilled, cheaper** form (regex first + LLM fallback). The relationship is skill → workflow, not the reverse.
+
+ ❌ Don't do this:
+ ```python
+ # rule_skills/R001/check.py — STUB, real logic elsewhere
+ def check(text):
+     rule_ids = re.findall(r"R\d{3}", load_skill())
+     return {rid: {"pass": None, "method": "stub",
+                   "note": "to be implemented in skill_testing"} for rid in rule_ids}
+ # actual verification logic only in workflows/R001/workflow_v1.py
+ ```
+
+ ✅ Do this:
+ ```python
+ # rule_skills/R001/check.py — canonical verification
+ def check(text):
+     matches = re.findall(r"...", text)  # real rule logic
+     return {"rule_id": "R001", "passed": bool(matches),
+             "evidence": matches[:3], "method": "regex"}
+
+ # workflows/R001/workflow_v1.py — distilled, cheaper form
+ def run(text, llm_fn=None):
+     result = check(text)  # baseline from the skill
+     if not result["passed"] and llm_fn:
+         result = llm_verify(text, llm_fn)  # escalate to the LLM on FAIL
+     return result
+ ```
+
+ Why it matters: the distillation phase's downstream consumers (the release tool, the run.py harness) load workflow.py. If check.py is a stub, the skill's methodology (SKILL.md) is reduced to documentation, and the verification logic is scattered across N workflow files. Later iterations on the skill (shifts in regulation interpretation, edge cases discovered in production) need one **canonical place** to update — the skill — not N workflows that have each drifted on their own.
+
+ E2E #6 v070 exposed this anti-pattern (DS wrote every bundled skill's check.py as `{"pass": null, "method": "stub"}`, deferring to workflows/). v0.7.1 wrote the anti-pattern into the skill explicitly.
+
+ E2E #7 v071 showed the anti-stub guidance worked on both conductors (neither run contained the `{"pass": null}` stub pattern), but **DS still inverted the canonical-vs-distilled relationship**: DS wrote 6 thematically grouped skill folders, each with only a SKILL.md (no check.py), while the real verification code lived in `workflows/<skill>/check.py`. No stubs is good; the inversion is not — changing one rule's logic then means touching both SKILL.md (the doc) and the workflow's check.py (the code), and the single source of truth is lost.
+
+ GLM v071, by contrast, landed the canonical pattern: 97/97 skills had both SKILL.md and a real `check.py` (regex + applicability logic, median 143 lines), while `workflows/<id>/workflow_v1.py` was a 50-line thin shell that just imported and called the skill's check.py:
+
+ ```python
+ # workflows/D01-01/workflow_v1.py — thin shell, 52 LOC
+ import importlib.util, json
+ from pathlib import Path
+
+ def run(doc_text: str, meta: dict = None) -> dict:
+     check_path = Path(__file__).parent.parent.parent / "rule_skills" / "D01-01" / "check.py"
+     spec = importlib.util.spec_from_file_location("check", check_path)
+     mod = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(mod)
+     result = mod.check(doc_text, meta)
+     result["_workflow"] = "D01-01_v1"
+     return result
+ ```
+
+ This is the v0.7.2+ canonical pattern: the workflow is a shell that points at the skill's check.py. To iterate on a rule's verification logic, edit `rule_skills/<id>/check.py`; the workflow doesn't change. v0.7.2 states the guidance more clearly: no stubs, AND keep the canonical relationship (the skill is canonical, the workflow is a distilled thin shell).
+
  ### Naming convention for grouped checks
 
  When you do bundle, name the file with the explicit range:
@@ -261,4 +337,26 @@ Keep the whole of PATTERNS.md to about 5 KB. Past that, trim the least actionable
  5. **Pick the first task.** Do it completely (skill + check + at least one local test). Write what you learned into PATTERNS.md. Move on to the next task.
  6. **Around task 5 and task 10:** stop and re-read PATTERNS.md. If the newly accumulated patterns suggest refactoring earlier work, **do it now** (cheap) rather than later (expensive).
 
- The engine's filesystem-derived milestones (v0.7.0 Group A) verify coverage against disk facts, however you split the work. The TaskBoard is your scratchpad; the disk is the contract.
+ ### Persist the methodology — PATTERNS.md, phase logs, or AGENT.md decisions
+
+ The principle: before each phase advance, write framework-level decisions to disk. Conversations get compacted, agents restart, and the next phase loses context. Whichever format you choose, **write to disk** — don't depend on conversation context that will disappear.
+
+ Three formats, each defensible; pick one and stick with it:
+
+ - **`rules/PATTERNS.md`** — concise, framework-level content only, updated as the project progresses. Suits greenfield projects where the hypothesis can be stated up front and the structure is clear. Capped at ~5 KB; entries are transferable shapes / project-level constraints / anti-patterns with rationale (see the "What to write" section above).
+
+ - **`logs/phase_<name>_complete.md` per phase** — incremental; records what each phase produced, which decisions were made, and what the next phase inherits. Suits iterative, discover-as-you-go work. E2E #7 GLM used this pattern: 6 phase docs plus an `evolution_summary_v1.2.md` — the methodology was captured all the same, just never as a PATTERNS.md.
+
+ - **`AGENT.md` decisions section + domain notes** — narrative style; a living document of "what we know" and "why". Suits projects with rich domain context to capture (regulations, edge cases, thresholds, sample format distributions). E2E #7 GLM's AGENT.md carried regulation effective dates, product type taxonomies, threshold values, and sample format counts — perfectly fine; it's a different idiom for the same goal.
+
+ What not to do: skip persistence and live on conversation context alone. By the time you've authored N skills without writing the methodology to disk, you've made N implicit decisions about verdict shape, chunker boundaries, and worker tier — every rule re-derived from scratch, and refactoring means touching N files instead of one.
+
+ ❌ "I'll write up these insights when I have time."
+
+ ✅ "Before each phase advance, write what this phase taught me into whichever persistence file fits this project's idiom — even if it's only a draft."
+
+ E2E history:
+ - E2E #6 v070: DS only wrote PATTERNS.md after the user intervened and rolled back. Until then each skill's design decisions had solidified on their own and had to be touched again later. v0.7.1 added the "PATTERNS.md FIRST" guidance.
+ - E2E #7 v071: neither DS nor GLM wrote PATTERNS.md, but GLM wrote 6 phase-completion logs and a thorough AGENT.md — the methodology *was* captured, just in different files. v0.7.2 writes the broader principle into the skill: persist before you advance; the format is flexible.
+
+ The engine's filesystem-derived milestones (v0.7.0 Group A) verify coverage against disk facts, however you split the work. The TaskBoard is your scratchpad; the disk is the contract; the persistence file is the project's memory.