kc-beta 0.7.0 → 0.7.2

package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "kc-beta",
-   "version": "0.7.0",
+   "version": "0.7.2",
    "description": "KC Agent — LLM document verification agent (pure Node.js CLI). Dual-licensed: PolyForm Noncommercial 1.0.0 for personal/noncommercial use; commercial license required for enterprise production. See LICENSE and LICENSE-COMMERCIAL.md.",
    "type": "module",
    "bin": {
@@ -21,6 +21,7 @@ import { SnapshotTool } from "./tools/snapshot.js";
  import { ArchiveFileTool } from "./tools/archive-file.js";
  import { ScheduleFetchTool } from "./tools/schedule-fetch.js";
  import { ReleaseTool } from "./tools/release.js";
+ import { readKcVersion } from "../util/kc-version.js";
  import { PhaseAdvanceTool } from "./tools/phase-advance.js";
  import { DocumentParseTool } from "./tools/document-parse.js";
  import { DocumentSearchTool } from "./tools/document-search.js";
@@ -421,9 +422,23 @@ export class AgentEngine {
        new SnapshotTool(this.workspace),
        new ArchiveFileTool(this.workspace),
        new ScheduleFetchTool(this.workspace),
-       new ReleaseTool(this.workspace, { kcVersion: "0.5.2" }),
+       new ReleaseTool(this.workspace, { kcVersion: readKcVersion() }),
        new PhaseAdvanceTool(
-         (to, reason, opts) => this._advancePhase(to, reason, opts),
+         // v0.7.1 2c: advanceFn returns a rich `{advanced, engineCounts?}`
+         // object so the tool's refusal text can surface the engine
+         // telemetry that motivated the refusal. Internal callers of
+         // `_advancePhase` continue to use the bool return value directly;
+         // only this lambda wraps for the LLM-facing tool.
+         (to, reason, opts) => {
+           const advanced = this._advancePhase(to, reason, opts);
+           if (!advanced) {
+             let engineCounts = null;
+             try { engineCounts = this._buildEngineCountsBlock(this.currentPhase); }
+             catch { /* defensive */ }
+             return { advanced: false, engineCounts };
+           }
+           return { advanced: true };
+         },
          () => this.currentPhase, // H1: tool reads phase BEFORE its own call
          // v0.6.2 J1: surface running subagents so the tool can refuse
          // advance until the agent explicitly acknowledges them.
@@ -1665,7 +1680,24 @@ export class AgentEngine {
          parts.push(`monitoring: ${pipeline.monitoringPhase ?? "?"}`);
          break;
        }
-       // bootstrap / finalization: no specific counters, fall through
+       case "bootstrap": {
+         // v0.7.2 1e: previously fell through to an empty string. Both
+         // v0.7.1 audit runs had bootstrap → rule_extraction refusals
+         // with engineCounts: "" — the agent saw the refusal but had no
+         // engine telemetry to react to. The InitializerPipeline tracks
+         // boolean checklist flags rather than numeric counters; we
+         // surface those flags as "yes/no" so the agent can see which
+         // bootstrap criterion is missing.
+         if (typeof pipeline.describeBootstrapChecklist === "function") {
+           const cl = pipeline.describeBootstrapChecklist();
+           parts.push(`workspaceCreated: ${cl.workspaceCreated ? "yes" : "no"}`);
+           parts.push(`configReady: ${cl.configReady ? "yes" : "no"}`);
+           parts.push(`hasRegulations: ${cl.hasRegulations ? "yes" : "no"}`);
+           parts.push(`hasSamples: ${cl.hasSamples ? "yes" : "no"}`);
+         }
+         break;
+       }
+       // finalization: no specific counters, fall through
      }
    } catch { /* never let summary build break phase advance */ }
    return parts.join(", ");
@@ -57,16 +57,21 @@ function listChildFiles(p) {
  // Walk a directory recursively, yielding every file path. Skips hidden
  // dirs/files and __pycache__. Used by derive functions that need to
  // match arbitrarily-nested artifacts (e.g., scripts/ subdirs).
- function* walkFiles(root) {
+ //
+ // v0.7.2 1a: optional maxDepth caps recursion. depth=0 is root's
+ // direct children; depth=1 is one level down. Default unbounded
+ // (existing callers).
+ function* walkFiles(root, { maxDepth } = {}) {
    if (!dirExists(root)) return;
-   const stack = [root];
+   const stack = [{ dir: root, depth: 0 }];
    while (stack.length) {
-     const dir = stack.pop();
+     const { dir, depth } = stack.pop();
      for (const e of readDirSafe(dir)) {
        if (e.name.startsWith(".") || e.name === "__pycache__") continue;
        const p = path.join(dir, e.name);
-       if (e.isDirectory()) stack.push(p);
-       else if (e.isFile()) yield p;
+       if (e.isDirectory()) {
+         if (maxDepth == null || depth < maxDepth) stack.push({ dir: p, depth: depth + 1 });
+       } else if (e.isFile()) yield p;
      }
    }
  }
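A quick reading of the new `maxDepth` semantics, since depth 0 already covers the root's direct children — a minimal sketch against a hypothetical layout (paths invented for illustration):

```js
// Hypothetical workspace, assuming walkFiles as patched above:
//   ws/a.json           — yielded (root's direct child, iterated at depth 0)
//   ws/out/b.json       — yielded (out/ pushed at depth 1)
//   ws/out/deep/c.json  — skipped  (entering deep/ would need depth 1 < maxDepth)
for (const p of walkFiles("ws", { maxDepth: 1 })) {
  console.log(p); // ws/a.json, ws/out/b.json
}
for (const p of walkFiles("ws")) {
  console.log(p); // all three — the default stays unbounded for existing callers
}
```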
@@ -250,7 +255,8 @@ export function deriveSkillAuthoringMilestones(workspace) {
  export function deriveSkillTestingMilestones(workspace) {
    const cwd = cwdOf(workspace);
    const skillsDir = path.join(cwd, "rule_skills");
-   const skillsTested = [];
+   // Use a Set so the v0.7.1 1a output/-side scan can add without duplicates.
+   const tested = new Set();

    if (dirExists(skillsDir)) {
      for (const e of listChildDirs(skillsDir)) {
@@ -266,14 +272,86 @@ export function deriveSkillTestingMilestones(workspace) {
          fileExists(path.join(skillPath, "assets", "test_cases.json")) ||
          listChildFiles(skillPath).some((f) =>
            /^(test|.*_test)_(output|result|log)/i.test(f.name) && f.name.endsWith(".json"));
-       if (hasTestArtifact) skillsTested.push(e.name);
+       if (hasTestArtifact) tested.add(e.name);
+     }
+   }
+
+   // v0.7.1 1a / v0.7.2 1a: credit rules whose verdicts appear anywhere
+   // under output/*.json. Agents persist batch-test results in
+   // conductor-specific shapes (this is the recurring drift point —
+   // engine derivation has to match disk reality, not the other way
+   // around). Shapes seen across E2E #5/6/7:
+   //
+   // - DS v0.7.0/0.7.1: catalog.json as an array of {id: "R001", ...}
+   //   entries; skill_test_*.json as {doc_name: {R019a: bool, ...}};
+   //   skill_test_阳光资产.json with {doc, results: {R019a: ...}}
+   // - GLM v0.7.1: rule_stats.json as {D01-01: {PASS, FAIL, NA}, ...};
+   //   full_test_results_v[1-6].json as {sample_id: {path, meta,
+   //   results: {D01-01: {verdict, ...}}}} (nested 2 levels deep, which
+   //   is why v0.7.1's shallow walk missed them)
+   //
+   // The collector recurses (depth-limited) and uses two heuristics to
+   // separate rule_ids from sample_ids / doc_names:
+   // 1. Rule-id shape: starts with a letter, ≤ 30 chars, contains digits
+   //    (matches R001, D01-01, T02-31; rejects 06f2ed1488, doc paths)
+   // 2. Verdict shape on values: {verdict, passed, pass, PASS, FAIL}
+   //    keys signal that the parent dict's keys are rule_ids
+   const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
+   const isRuleIdShape = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);
+   const looksLikeVerdict = (v) =>
+     v && typeof v === "object" && !Array.isArray(v) && (
+       v.verdict !== undefined ||
+       v.passed !== undefined ||
+       v.pass !== undefined ||
+       typeof v.PASS === "number" ||
+       typeof v.FAIL === "number"
+     );
+   const collectFromJsonFile = (data, depth = 0) => {
+     if (!data || depth > 4) return;
+     if (typeof data !== "object") return;
+     if (Array.isArray(data)) {
+       for (const r of data) collectFromJsonFile(r, depth + 1);
+       return;
+     }
+     // {rule_id: "X"} or {id: "R001"} on a rule entry
+     if (isRuleIdShape(data.rule_id)) tested.add(data.rule_id);
+     if (isRuleIdShape(data.id)) tested.add(data.id);
+     // {<rule_id>: <verdict_shaped>, ...} (rule_stats / per-doc test_results)
+     for (const [k, v] of Object.entries(data)) {
+       if (isRuleIdShape(k) && looksLikeVerdict(v)) tested.add(k);
      }
+     // {results: {<rule_id>: ...}} — keys must look rule-id-shaped
+     if (data.results && typeof data.results === "object" && !Array.isArray(data.results)) {
+       for (const k of Object.keys(data.results)) {
+         if (isRuleIdShape(k)) tested.add(k);
+       }
+     }
+     // Recurse into nested objects (handles {sample_id: {results: {...}}})
+     for (const v of Object.values(data)) {
+       if (v && typeof v === "object") collectFromJsonFile(v, depth + 1);
+     }
+   };
+
+   const outputDir = path.join(cwd, "output");
+   for (const p of walkFiles(outputDir, { maxDepth: 6 })) {
+     if (!p.endsWith(".json")) continue;
+     collectFromJsonFile(readJsonSafe(p));
+   }
+
+   // DS v070 wrote a top-level aggregate at either rules/test_results.json
+   // OR rule_skills/test_results.json. Both seen in the wild; check both.
+   for (const candidate of [
+     path.join(cwd, "rules", "test_results.json"),
+     path.join(cwd, "rule_skills", "test_results.json"),
+     path.join(cwd, "test_results.json"),
+   ]) {
+     if (fileExists(candidate)) collectFromJsonFile(readJsonSafe(candidate));
    }

    // skillsPassing — per-skill accuracy threshold. Without a uniform
    // schema across agent outputs we report `tested` as the floor; the
    // pipeline's existing _loadTestResults() can layer accuracy on top.
-   return { skillsTested };
+   return { skillsTested: [...tested] };
  }

  // ───────────────────────────────────────────────────────────────────
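The two collector heuristics are worth sanity-checking in isolation — a minimal sketch using the exact regex from the hunk (sample strings invented):

```js
const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
const isRuleIdShape = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);

isRuleIdShape("R001");              // true  — letter prefix, short, has digits
isRuleIdShape("D01-01");            // true  — GLM-style id
isRuleIdShape("06f2ed1488");        // false — starts with a digit
isRuleIdShape("summary");           // false — no digits, so doc_name-like keys are rejected
isRuleIdShape("samples/doc_1.pdf"); // false — "/" and "." aren't in the character class
```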
@@ -205,6 +205,33 @@ export class RuleExtractionPipeline extends Pipeline {
      return `workflow_run is SKILL_TESTING/PRODUCTION_QC-phase work, but engine is in RULE_EXTRACTION. ${exitText}`;
    }

+   // v0.7.1 2a/2b: when the agent attempts phase_advance from
+   // rule_extraction, surface advisories for the two soft-but-load-bearing
+   // artifacts the gate criteria require (chunk_refs and coverage_audit).
+   // The v0.7.0 GLM session forced through with both missing — the gate
+   // refused for the right reason, but the refusal text was generic.
+   // Name them inline.
+   if (toolName === "phase_advance" && toolInput?.to === "skill_authoring") {
+     const advisories = [];
+     if (this.rulesExtracted.length > 0 && this.rulesWithChunkRefs.length === 0) {
+       advisories.push(
+         `Advancing rule_extraction with rulesWithChunkRefs=0/${this.rulesExtracted.length}. ` +
+         `The skill_authoring phase's prompts use source_chunk_ids to ground ` +
+         `skill explanations against regulation text. Without them, skill authoring ` +
+         `runs blind. Either populate chunk refs via the rule_catalog tool, or ` +
+         `accept that skill_authoring's generated content won't cite source regulation.`,
+       );
+     }
+     if (this.rulesExtracted.length > 0 && !this.coverageAudited) {
+       advisories.push(
+         `Advancing rule_extraction without rules/coverage_audit.md (or .json). ` +
+         `Coverage audit identifies regulation articles you didn't extract a rule ` +
+         `for — without it, gaps go silent through to production. If your ` +
+         `extraction is genuinely complete, write a one-paragraph audit confirming so.`,
+       );
+     }
+     if (advisories.length > 0) return advisories.join("\n\n");
+   }
+
    return null;
  }

@@ -5,6 +5,7 @@ import { PipelineEvent } from "./index.js";
  import { Pipeline } from "./base.js";
  import { normalizeRuleCatalog } from "../rule-catalog-normalize.js";
  import { deriveFinalizationMilestones } from "./_milestone-derive.js";
+ import { readKcVersion } from "../../util/kc-version.js";

  const __dirname = path.dirname(fileURLToPath(import.meta.url));
  // v0.7.0 N: ship template/release/v1/ from the npm package; copy into
@@ -310,12 +311,7 @@ export class FinalizationPipeline extends Pipeline {
  }

  _readKcVersion() {
-   try {
-     const pkg = JSON.parse(fs.readFileSync(
-       path.resolve(__dirname, "../../../package.json"), "utf-8",
-     ));
-     return pkg.version || "unknown";
-   } catch { return "unknown"; }
+   return readKcVersion();
  }

  /**
@@ -240,6 +240,19 @@ export class ProjectInitializer extends Pipeline {
    return this.workspaceCreated && this.configReady && this.hasRegulations && this.hasSamples;
  }

+ // v0.7.2 1e: surface the checklist as engine telemetry so
+ // `_buildEngineCountsBlock("bootstrap")` has something to report when
+ // bootstrap → rule_extraction is refused. The agent sees the missing
+ // criteria directly in the refusal text.
+ describeBootstrapChecklist() {
+   return {
+     workspaceCreated: !!this.workspaceCreated,
+     configReady: !!this.configReady,
+     hasRegulations: !!this.hasRegulations,
+     hasSamples: !!this.hasSamples,
+   };
+ }
+
  /**
   * v0.6.3 (#74): nudge the agent when it does work that belongs to a later
   * phase. Bootstrap is setup — reading rules/samples, configuring keys,
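For reference, this is the shape of the telemetry the checklist feeds into the refusal text — a sketch assuming a run where samples haven't landed yet (flag values invented):

```js
const cl = { workspaceCreated: true, configReady: true, hasRegulations: true, hasSamples: false };
const parts = Object.entries(cl).map(([k, v]) => `${k}: ${v ? "yes" : "no"}`);
console.log(parts.join(", "));
// → workspaceCreated: yes, configReady: yes, hasRegulations: yes, hasSamples: no
```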
@@ -14,6 +14,11 @@ export class SkillTestingPipeline extends Pipeline {
    this.iterationCount = 0;
    this._accuracyThreshold = 0.9;
    this._maxIterations = 20;
+   // v0.7.1 1b: rate-limit phaseMisfitHint firing for ephemeral
+   // sandbox tests. Caps at ~3 nudges per phase entry so the agent
+   // sees the path expectation but doesn't get spammed during a
+   // batch run.
+   this._misfit_nudge_count = 0;
    this._scanWorkspace();
  }

@@ -132,6 +137,12 @@ export class SkillTestingPipeline extends Pipeline {
   * v0.6.3 (#74): SKILL_TESTING runs check scripts against test samples and
   * measures accuracy. Writing distillation outputs or production results
   * here means phase boundaries got skipped.
+  *
+  * v0.7.1 1b: also nudges agents who run check scripts via sandbox_exec
+  * but don't persist verdicts. E2E #6 v070 surfaced this — both
+  * conductors batched tests in one sandbox_exec call, read pass/fail
+  * from stdout, then declared "testing done" while the engine saw
+  * skillsTested=0 because nothing landed in a recognized path.
   */
  phaseMisfitHint(toolName, toolInput, result) {
    if (result?.isError) return null;
@@ -148,6 +159,34 @@ export class SkillTestingPipeline extends Pipeline {
      return `Writing under output/results/ is PRODUCTION_QC-phase work, but engine is in SKILL_TESTING. ${exitText}`;
    }
  }

+ // v0.7.1 1b: sandbox_exec test-command nudge
+ if (toolName === "sandbox_exec") {
+   const cmd = String(toolInput?.command || "");
+   const looksLikeTest =
+     /python.*check.*\.py.*\.(txt|pdf|md|docx)/i.test(cmd) ||
+     /pytest|unittest|run_tests/i.test(cmd) ||
+     /python.*workflow.*\.py.*samples/i.test(cmd);
+   if (!looksLikeTest) return null;
+
+   const tested = Object.keys(this.skillsTested).length;
+   const total = this.skillsToTest.length;
+   // Already satisfied? Don't nudge.
+   if (total === 0 || tested >= total) return null;
+
+   // Rate-limit: ~3 per phase. Counter resets on phase entry
+   // (constructor) and on importState if available.
+   this._misfit_nudge_count = (this._misfit_nudge_count || 0) + 1;
+   if (this._misfit_nudge_count > 3) return null;
+
+   return (
+     `Engine derives skillsTested from rule_skills/<id>/test_results.json, ` +
+     `rule_skills/<id>/tests/, OR output/*.json with a rule_id field. ` +
+     `Sandbox runs are ephemeral — record per-rule verdicts to one of ` +
+     `those paths before phase_advance. Currently the engine sees ` +
+     `${tested}/${total} skills tested.`
+   );
+ }
  return null;
}

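A quick sanity sketch of which commands trip the new nudge, exercising the three `looksLikeTest` patterns from the hunk (commands invented):

```js
const looksLikeTest = (cmd) =>
  /python.*check.*\.py.*\.(txt|pdf|md|docx)/i.test(cmd) ||
  /pytest|unittest|run_tests/i.test(cmd) ||
  /python.*workflow.*\.py.*samples/i.test(cmd);

looksLikeTest("python rule_skills/R001/check.py samples/doc_01.txt"); // true — per-rule check
looksLikeTest("python workflows/run_batch.py samples/");              // true — batch workflow run
looksLikeTest("pytest -q");                                           // true — test runner
looksLikeTest("python scripts/parse_catalog.py");                     // false — no nudge
```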
@@ -15,7 +15,11 @@ const VALID_PHASES = new Set(Object.values(Phase));
   */
  export class PhaseAdvanceTool extends BaseTool {
    /**
-    * @param {(to: string, reason: string, opts: {force?: boolean}) => boolean} advanceFn
+    * @param {(to: string, reason: string, opts: {force?: boolean}) => {advanced: boolean, engineCounts?: string}} advanceFn
+    *   v0.7.1 2c: returns the rich object so the tool can surface engine
+    *   telemetry in the refusal text. Internal engine callers of
+    *   `_advancePhase` still get the bool; only this LLM-facing tool
+    *   uses the wrapped form.
     * @param {() => string} getCurrentPhaseFn - H1: lets the tool read the
     *   engine's phase BEFORE the call, so it can distinguish "already there"
     *   (silent no-op, informational) from "non-adjacent refusal" (actionable).
@@ -91,7 +95,11 @@ export class PhaseAdvanceTool extends BaseTool {
      );
    }

-   const advanced = this._advance(to, input.reason || "agent request", { force: !!input.force });
+   // v0.7.1 2c: advanceFn returns {advanced, engineCounts?} so we can
+   // surface telemetry in the refusal text below. Internal callers of
+   // _advancePhase still get the bool; only this LLM-facing tool unwraps.
+   const advanceResult = this._advance(to, input.reason || "agent request", { force: !!input.force });
+   const advanced = !!advanceResult?.advanced;
    if (advanced) {
      // Log the ack so post-mortems can find phase advances that proceeded
      // with live subagents
@@ -113,9 +121,18 @@ export class PhaseAdvanceTool extends BaseTool {
    // immediately (12/12 transitions). The escape valve remains in the input
    // schema (discoverable) but isn't hand-fed to the LLM here. Instead,
    // direct the agent at the missing milestones it can satisfy.
+   //
+   // v0.7.1 2c: include engineCounts when available so the agent sees
+   // exactly which milestones the gate is reading and can satisfy them.
+   // E2E #6 v070 showed the generic "check /status" hint wasn't concrete
+   // enough — agents forced through. Naming the gap inline reduces that.
+   const engineCountsLine = advanceResult?.engineCounts
+     ? `\nEngine telemetry: ${advanceResult.engineCounts}`
+     : "";
+
    return new ToolResult(
      `Did not advance to ${to} (currently in ${beforePhase || "?"}). ` +
-     `Likely cause: source-phase exit criteria not met. ` +
+     `Likely cause: source-phase exit criteria not met.${engineCountsLine}\n\n` +
      `Run /status (or read the phase describeState block in this turn's system reminder) ` +
      `to see which milestones are missing, then produce the disk artifacts that satisfy them — ` +
      `the engine derives milestones from filesystem facts (rule_skills/<id>/SKILL.md, check.py, ` +
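Put together, a refusal now reads roughly like this — a sketch with a hypothetical engineCounts string; the real milestone names come from `_buildEngineCountsBlock`:

```js
const advanceResult = { advanced: false, engineCounts: "skillsTested: 3/10, iterations: 2" };
const engineCountsLine = advanceResult?.engineCounts
  ? `\nEngine telemetry: ${advanceResult.engineCounts}`
  : "";
console.log(
  `Did not advance to production_qc (currently in skill_testing). ` +
  `Likely cause: source-phase exit criteria not met.${engineCountsLine}`
);
// Did not advance to production_qc (currently in skill_testing). Likely cause: source-phase exit criteria not met.
// Engine telemetry: skillsTested: 3/10, iterations: 2
```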
@@ -178,7 +178,21 @@ export class ReleaseTool extends BaseTool {
      path.join(bundleAbs, "glossary.json"), { fallback: '{"version":1,"entries":[]}\n' });
    this._copyIfExists(path.join(this._workspace.cwd, "corner_cases.json"),
      path.join(bundleAbs, "corner_cases.json"), { fallback: '[]\n' });
-   this._copyIfExists(path.join(this._workspace.cwd, "confidence_calibration.json"),
+   // v0.7.2 1c: auto-aggregate from output/ if no calibration file exists
+   // at the workspace root. Both v0.7.1 audit runs (DS + GLM) shipped
+   // releases with an empty `historical_accuracy: {}` despite having
+   // per-rule QC data on disk under output/ — the release tool just passed
+   // the file through and emitted a stub on miss. We try to populate from
+   // known QC artifact shapes here; if nothing matches, fall through to
+   // the existing stub fallback.
+   const calibSrc = path.join(this._workspace.cwd, "confidence_calibration.json");
+   if (!fs.existsSync(calibSrc)) {
+     const aggregated = this._aggregateAccuracyFromOutput();
+     if (aggregated && Object.keys(aggregated.historical_accuracy).length > 0) {
+       fs.writeFileSync(calibSrc, JSON.stringify(aggregated, null, 2) + "\n", "utf-8");
+     }
+   }
+   this._copyIfExists(calibSrc,
      path.join(bundleAbs, "confidence_calibration.json"),
      { fallback: '{"historical_accuracy":{}}\n' });

@@ -233,6 +247,30 @@ export class ReleaseTool extends BaseTool {
      .replace(/\{RULES_LIST\}/g, rulesList);
    fs.writeFileSync(path.join(bundleAbs, "README.md"), readme, "utf-8");

+   // v0.7.2 1d: clean up the template scaffold dir if a customized
+   // release was just written alongside it. Both v0.7.1 audit runs
+   // shipped with `output/releases/v1/` (template-derived, .tmpl files
+   // lingering) AND `output/releases/v1-0/` (or v1-0-hybrid/) — the
+   // customized release. The pre-scaffold is meant as a hint; once the
+   // agent calls `release(label="v1-0")` and we've written the real
+   // bundle, the unedited scaffold is just clutter.
+   //
+   // Conservative gate: only delete a sibling `v1/` if BOTH (a) we
+   // didn't just write to v1/ ourselves, AND (b) it still contains
+   // .tmpl files (the signature of an unedited template). If the agent
+   // intentionally edited v1/ in place (removing .tmpl), our cleanup
+   // leaves it alone.
+   if (slug !== "v1") {
+     const tmplScaffold = path.join(this._workspace.resolvePath(path.join("output", "releases")), "v1");
+     if (fs.existsSync(tmplScaffold) && fs.statSync(tmplScaffold).isDirectory()) {
+       let hasTmpl = false;
+       try { hasTmpl = fs.readdirSync(tmplScaffold).some((f) => f.endsWith(".tmpl")); } catch { /* ignore */ }
+       if (hasTmpl) {
+         try { fs.rmSync(tmplScaffold, { recursive: true, force: true }); } catch { /* best-effort */ }
+       }
+     }
+   }
+
    // Bundle dir is in output/ (gitignored). Snapshot manifest in snapshots/ IS tracked.
    const lines = [
      `Release '${label}' bundled at ${bundleRel}`,
@@ -319,6 +357,118 @@ export class ReleaseTool extends BaseTool {
    return null;
  }

+ // v0.7.2 1c: walk output/ for QC artifacts and aggregate per-rule
+ // accuracy. Recognized shapes (covering the DS + GLM v0.7.1 audit runs):
+ //
+ //   rule_stats_v*.json — {<rule_id>: {PASS: N, FAIL: N, NOT_APPLICABLE: N, ERROR: N}}
+ //     (GLM produced 4 versions; pick the highest)
+ //   full_test_results_v*.json — {<sample_id>: {results: {<rule_id>: {verdict}}}}
+ //     (GLM; accumulate verdicts per rule across samples)
+ //   skill_test_*.json — {<doc_name>: {<rule_id>: bool}} (DS shape)
+ //
+ // Returns null if no recognized artifact, or an object with
+ // { historical_accuracy: {<rule_id>: {pass_rate, n_samples, ...}}, computed_at, source_files }
+ // suitable for confidence_calibration.json.
+ _aggregateAccuracyFromOutput() {
+   const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
+   const isRuleId = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);
+   const tally = new Map(); // rule_id -> {pass, fail, na, n}
+   const sourceFiles = [];
+   const bump = (rid, kind) => {
+     if (!isRuleId(rid)) return;
+     const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
+     t[kind] += 1;
+     t.n += 1;
+     tally.set(rid, t);
+   };
+   const outputDir = path.join(this._workspace.cwd, "output");
+   if (!fs.existsSync(outputDir)) return null;
+
+   // Collect all .json files under output/ (depth-limited)
+   const files = [];
+   const stack = [{ dir: outputDir, depth: 0 }];
+   while (stack.length) {
+     const { dir, depth } = stack.pop();
+     let entries;
+     try { entries = fs.readdirSync(dir, { withFileTypes: true }); } catch { continue; }
+     for (const e of entries) {
+       if (e.name.startsWith(".") || e.name === "__pycache__") continue;
+       const p = path.join(dir, e.name);
+       if (e.isDirectory()) {
+         if (depth < 6) stack.push({ dir: p, depth: depth + 1 });
+       } else if (e.isFile() && e.name.endsWith(".json")) {
+         files.push({ path: p, name: e.name });
+       }
+     }
+   }
+
+   // 1) Prefer rule_stats_v<N>.json (highest version) — direct counts
+   const ruleStatsFiles = files
+     .filter((f) => /^rule_stats(?:_v\d+)?\.json$/i.test(f.name))
+     .map((f) => ({ ...f, ver: (f.name.match(/_v(\d+)/) || [0, 0])[1] | 0 }))
+     .sort((a, b) => b.ver - a.ver);
+   if (ruleStatsFiles.length > 0) {
+     const top = ruleStatsFiles[0];
+     try {
+       const d = JSON.parse(fs.readFileSync(top.path, "utf-8"));
+       for (const [rid, stats] of Object.entries(d)) {
+         if (!isRuleId(rid) || !stats || typeof stats !== "object") continue;
+         const pass = stats.PASS | 0, fail = stats.FAIL | 0;
+         const na = stats.NOT_APPLICABLE | stats.NA | 0;
+         const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
+         t.pass += pass; t.fail += fail; t.na += na; t.n += pass + fail + na;
+         tally.set(rid, t);
+       }
+       sourceFiles.push(path.relative(this._workspace.cwd, top.path));
+     } catch { /* fall through to other shapes */ }
+   }
+
+   // 2) Fallback: full_test_results*.json with nested {sample_id: {results: {rid: {verdict}}}}
+   if (tally.size === 0) {
+     const ftrFiles = files
+       .filter((f) => /^full_test_results(?:_v\d+)?\.json$/i.test(f.name))
+       .map((f) => ({ ...f, ver: (f.name.match(/_v(\d+)/) || [0, 0])[1] | 0 }))
+       .sort((a, b) => b.ver - a.ver);
+     for (const f of ftrFiles.slice(0, 1)) {
+       try {
+         const d = JSON.parse(fs.readFileSync(f.path, "utf-8"));
+         for (const sample of Object.values(d)) {
+           if (!sample || typeof sample !== "object") continue;
+           const results = sample.results;
+           if (!results || typeof results !== "object") continue;
+           for (const [rid, r] of Object.entries(results)) {
+             if (!isRuleId(rid) || !r || typeof r !== "object") continue;
+             const verdict = (r.verdict || "").toString().toUpperCase();
+             if (verdict === "PASS") bump(rid, "pass");
+             else if (verdict === "FAIL") bump(rid, "fail");
+             else if (verdict === "NOT_APPLICABLE" || verdict === "NA") bump(rid, "na");
+           }
+         }
+         sourceFiles.push(path.relative(this._workspace.cwd, f.path));
+       } catch { /* try next shape */ }
+     }
+   }
+
+   if (tally.size === 0) return null;
+
+   const historical_accuracy = {};
+   for (const [rid, t] of tally.entries()) {
+     const fired = t.pass + t.fail;
+     historical_accuracy[rid] = {
+       pass_rate: fired > 0 ? +(t.pass / fired).toFixed(4) : null,
+       n_passed: t.pass,
+       n_failed: t.fail,
+       n_not_applicable: t.na,
+       n_samples: t.n,
+     };
+   }
+   return {
+     historical_accuracy,
+     computed_at: new Date().toISOString(),
+     source_files: sourceFiles,
+   };
+ }
+
  _readWorkerTiers() {
    const envPath = path.join(this._workspace.cwd, ".env");
    const out = { tier1: "", tier2: "", tier3: "", tier4: "" };
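The per-rule arithmetic is easy to verify by hand — a sketch of what the aggregator does with one rule_stats entry (counts invented, shape as documented above):

```js
// One entry from a hypothetical output/rule_stats_v4.json: { "D01-01": stats }
const stats = { PASS: 70, FAIL: 10, NOT_APPLICABLE: 10 };
const fired = stats.PASS + stats.FAIL;          // NA verdicts don't count toward the rate
const entry = {
  pass_rate: +(stats.PASS / fired).toFixed(4),  // 0.875
  n_passed: stats.PASS,
  n_failed: stats.FAIL,
  n_not_applicable: stats.NOT_APPLICABLE,
  n_samples: stats.PASS + stats.FAIL + stats.NOT_APPLICABLE, // 90
};
// confidence_calibration.json then carries { historical_accuracy: { "D01-01": entry }, ... }
```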
@@ -0,0 +1,27 @@
+ // Single source of truth for the live KC CLI version string.
+ //
+ // Reads package.json. Used by engine.js (passed to ReleaseTool so
+ // release manifests stamp the correct version) and by
+ // pipelines/finalization.js (anywhere it surfaces "Built by kc-beta X").
+ //
+ // Before v0.7.2, engine.js hardcoded `kcVersion: "0.5.2"`, which leaked
+ // into every release manifest's `kc_beta_version` field regardless of
+ // the actual package version. Both v0.7.1 audit runs (DS + GLM)
+ // surfaced this. Reading package.json closes the gap.
+
+ import fs from "node:fs";
+ import path from "node:path";
+ import { fileURLToPath } from "node:url";
+
+ const __filename = fileURLToPath(import.meta.url);
+ const __dirname = path.dirname(__filename);
+
+ export function readKcVersion() {
+   try {
+     const pkgPath = path.resolve(__dirname, "../../package.json");
+     const pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
+     return pkg.version || "unknown";
+   } catch {
+     return "unknown";
+   }
+ }
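Downstream callers then need only the one import — a usage sketch (the relative path depends on the caller, as in the engine.js and finalization.js hunks above):

```js
import { readKcVersion } from "../util/kc-version.js";

// "0.7.2" when run from this release; "unknown" if package.json can't be read.
const manifest = { kc_beta_version: readKcVersion() };
```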
@@ -223,6 +223,24 @@ Regulations are often ambiguous. When you encounter ambiguity:
 
  Do not skip ambiguous rules. They are often the most important ones.
 
+ ## Sanity-check applicability against the sample corpus
+
+ After extracting your rule catalog and before authoring skills, do this 5-minute check: project each rule's applicability filter against the sample corpus.
+
+ For every rule:
+ 1. Walk `samples/` and classify each sample by product type / report type / document format
+ 2. For each rule, count how many samples it would apply to (per the rule's `applicability` field, scope filter, or whatever shape your catalog uses)
+ 3. Flag rules that apply to **0 samples** — they're either genuinely test-corpus-irrelevant (acceptable) or over-constrained (a bug)
+
+ E2E #7 GLM produced a 97-rule catalog where 36 rules (37%) had `PASS=0 FAIL=0 NOT_APPLICABLE=90` across all 90 documents — they never fired. Some were legitimate (rules for cash-management products with no cash-management samples in the corpus), but 36 inactive rules out of 97 was high enough to suggest scope-too-narrow drift.
+
+ If many rules are 0-sample, either:
+ - **Reframe their applicability** — broaden product types, look for evidence in headers/footers rather than just the body, relax the scope filter
+ - **Document them as "future scope"** and remove them from this iteration's catalog (still capture them in a `rules/future_scope.md` so they're not forgotten)
+ - **Update the test corpus** to include matching samples (work with the developer user)
+
+ Catching this in `rule_extraction` is much cheaper than authoring 36 skills that then test as inactive in `skill_testing`. The cheap projection here is worth the time it saves later.
+
  ## When Rules Change
 
  Regulations evolve. When the developer user adds new or updated regulation documents:
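The projection in step 2 is a few lines if your catalog carries a structured applicability filter — a sketch with a hypothetical `product_types` schema (field names invented; adapt to whatever shape your catalog uses):

```js
// Hypothetical classified inputs — not KC's actual schema:
const samples = [{ product_type: "fixed_income" }, { product_type: "equity" }];
const catalog = [
  { id: "R001", applicability: { product_types: ["fixed_income"] } },
  { id: "R002", applicability: { product_types: ["cash_management"] } },
];
const inactive = catalog.filter(
  (rule) => !samples.some((s) => rule.applicability.product_types.includes(s.product_type)),
);
console.log(`${inactive.length}/${catalog.length} rules match 0 samples`); // 1/2 — R002 never fires
// Anything near E2E #7's 36/97 is a signal to reframe, defer, or extend the corpus.
```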
@@ -45,6 +45,32 @@ If yes, design a worker LLM prompt. Use the smallest model tier that maintains a
  ### The hybrid approach (most common)
  Most rules are a mix: regex extracts the number, Python compares it to the threshold, and the LLM handles the exceptional cases. Design the workflow as a pipeline where cheap steps run first and expensive steps run only when needed.
 
+ ### When regex alone isn't enough — decision rubric
+
+ Before declaring distillation complete, audit each rule's `verification_type` / `metric` / `evidence_type` (or the equivalent fields in your catalog). For rules where the required verification is one of:
+
+ - **Semantic** ("is this a positive guarantee or a disclaimer?")
+ - **Contextual** ("interpret this in light of the document's product type")
+ - **Counterfactual** ("what should this value be, given the other fields?")
+ - **Cross-field arithmetic** ("does 期初 + 收益 - 分配 = 期末?")
+
+ regex alone rarely suffices. Three acceptable forms:
+
+ 1. **Pure regex with documented limits** — write the regex check and include a comment explaining the fragility (e.g., "matches the syntactic pattern only; cannot detect semantic guarantees")
+ 2. **Hybrid regex + LLM** — a regex baseline catches the obvious cases; `worker_llm_call` (tier1-2) handles the ambiguous ones. The hybrid workflow declares which rule_ids escalate.
+ 3. **Pure LLM via `worker_llm_call`** — for fully semantic rules where no regex baseline is meaningful.
+
+ Don't ship pure regex for a rule whose `verification_type` is `judgment` / `semantic` without the documented-limits note. Future-you or a colleague will assume the regex is sufficient, and that bug will hide for months.
+
+ ### Worker LLM cost-aware tier choice
+
+ If you do escalate to an LLM:
+ - **tier1** (most capable, ~¥0.001-0.002/doc): cross-field reasoning, ambiguity resolution, rules that benefit from chain-of-thought
+ - **tier2-3**: bulk extraction with simple semantic checks
+ - **tier4** (cheapest): high-volume keyword-spotting that regex can't handle. Note: tier4 models on SiliconFlow are Qwen3.5 thinking-mode — `content` can come back empty if `reasoning_content` consumes max_tokens. Test with realistic prompts before relying on it. If you see empty responses, either bump max_tokens to ≥ 8192, shorten your prompt, or fall back to tier1-2.
+
+ Both v0.7.1 audit conductors (DS and GLM) defaulted to all-regex distillation and only added LLM escalation when the human user explicitly asked for "V2 with worker LLM". If your rule catalog has any rules where the verification is genuinely semantic, reach for `worker_llm_call` yourself — don't wait to be asked.
+
  ## Workflow Structure
 
  A workflow is a Python file (or small set of files) in `workflows/`:
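All three forms in the rubric above share one control-flow shape: the cheap check always runs first, and only what stays ambiguous escalates. A sketch with hypothetical `regexCheck` / `llmVerify` helpers standing in for the regex baseline and `worker_llm_call` (not KC's actual API):

```js
// Hypothetical helpers — invented for illustration of form 2's control flow.
const regexCheck = (text) => ({
  passed: /不保证(本金|收益)/.test(text),                       // syntactic disclaimer found
  ambiguous: /保证/.test(text) && !/不保证/.test(text),         // guarantee-like wording, unclear polarity
});
const llmVerify = async (text) => ({ passed: false, method: "worker_llm" }); // placeholder

async function verify(text) {
  const base = regexCheck(text);                  // cheap step always runs first
  if (!base.ambiguous) return { ...base, method: "regex" };
  return llmVerify(text);                         // escalate only the ambiguous remainder
}
```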
@@ -101,6 +101,87 @@ The v0.6.2 D2 anti-pattern wording captures the failure case clearly:
 
  That came from E2E #4 where one conductor wrote a 2,400-line `unified_qc.py` that ran all rules at once. It produced 1,150 ERROR verdicts (16.6%) because every rule's failure cascaded into every other rule's verdict. Per-rule skills are KC's unit of granularity for a reason.
 
+ ### Anti-pattern: stub check.py + real workflow.py
+
+ Do NOT make `rule_skills/<id>/check.py` a stub that defers to `workflows/<id>/workflow.py`. KC's intent: SKILL.md + check.py is the **canonical** verification. workflow.py is the **distilled, cheaper** form (regex baseline + LLM fallback). The relationship is skill → workflow, not workflow → skill.
+
+ ❌ DON'T:
+ ```python
+ # rule_skills/R001/check.py — STUB, real logic elsewhere
+ def check(text):
+     rule_ids = re.findall(r"R\d{3}", load_skill())
+     return {rid: {"pass": None, "method": "stub",
+                   "note": "to be implemented later"} for rid in rule_ids}
+ # real verification logic only in workflows/R001/workflow_v1.py
+ ```
+
+ ✅ DO:
+ ```python
+ # rule_skills/R001/check.py — canonical verification
+ def check(text):
+     matches = re.findall(r"...", text)  # actual rule logic
+     return {"rule_id": "R001", "passed": bool(matches),
+             "evidence": matches[:3], "method": "regex"}
+
+ # workflows/R001/workflow_v1.py — distilled, cheaper form
+ def run(text, llm_fn=None):
+     result = check(text)  # baseline from skill
+     if not result["passed"] and llm_fn:
+         result = llm_verify(text, llm_fn)  # escalate on fail
+     return result
+ ```
+
+ Why it matters: distillation-phase consumers (the release tool, the run.py harness) load workflow.py. If check.py is a stub, the skill's methodology (SKILL.md) becomes documentation-only and the verification logic is scattered across N workflow files. Future iterations of the skill (changes to regulation interpretation, edge cases discovered in production) need a single canonical place to update — the skill — not N workflows that have drifted independently.
+
+ E2E #6 v070 surfaced this pattern (DS's bundled-skill check.py files all returned `{"pass": null, "method": "stub"}`, deferring to workflows/). v0.7.1 added this anti-pattern explicitly.
+
+ E2E #7 v071 showed the teaching prevented the stub anti-pattern in both conductors (no `{"pass": null}` patterns in either run), but **DS still inverted the canonical-vs-distilled relationship**: DS's 6 thematic skill folders had SKILL.md only (no check.py), with the real verification code living in `workflows/<skill>/check.py`. The absence of stubs is good; the inversion is not — editing a rule then requires touching both SKILL.md (the doc) and the workflow's check.py (the code). The single source of truth is lost.
+
+ GLM v071, by contrast, landed the canonical pattern: 97/97 skills had both SKILL.md AND a real `check.py` (median 143 LOC of regex + applicability logic), and `workflows/<id>/workflow_v1.py` was a 50-line thin wrapper that imported and called it:
+
+ ```python
+ # workflows/D01-01/workflow_v1.py — thin wrapper, 52 LOC
+ import importlib.util, json
+ from pathlib import Path
+
+ def run(doc_text: str, meta: dict = None) -> dict:
+     check_path = Path(__file__).parent.parent.parent / "rule_skills" / "D01-01" / "check.py"
+     spec = importlib.util.spec_from_file_location("check", check_path)
+     mod = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(mod)
+     result = mod.check(doc_text, meta)
+     result["_workflow"] = "D01-01_v1"
+     return result
+ ```
+
+ This is the v0.7.2+ canonical pattern: the workflow is a shim that points at the skill's check.py. To iterate on a rule's verification, edit `rule_skills/<id>/check.py`. The workflow doesn't change. v0.7.2 clarifies the teaching: avoid stubs AND keep the canonical relationship (the skill is canonical, the workflow is a distilled wrapper).
+
  ### Naming convention for grouped checks
 
  When you do bundle, name the file with the explicit range:
@@ -263,4 +344,26 @@ When entering skill_authoring with an empty TaskBoard:
  5. **Pick the first task.** Work it to completion (skill + check + at least one local test). Update PATTERNS.md with whatever you learned. Move to the next task.
  6. **At task ~5 and task ~10:** stop and re-read PATTERNS.md. If the patterns suggest a refactor of earlier work, do it now (cheap) rather than later (expensive).
 
- The engine's filesystem-derived milestones (Group A v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract.
+ ### Persisted methodology: PATTERNS.md OR phase logs OR AGENT.md decisions
+
+ The principle: capture framework-level decisions to disk before each phase advance. The conversation will compact, agents will restart, and the next phase will lose grounding. Whichever format you pick, write to disk — don't rely on conversation context that disappears.
+
+ Three formats, each defensible. Pick one and stick with it:
+
+ - **`rules/PATTERNS.md`** — concise, framework-only, updated as the project progresses. Best for greenfield projects with a clear hypothesis-up-front structure. Capped at ~5 KB; entries are transferable shapes / project constraints / anti-patterns with rationale (see "What to write" above).
+
+ - **`logs/phase_<name>_complete.md` per phase** — incremental; captures what each phase produced, the decisions made, and what the next phase inherits. Best for iterative discovery work where the framework crystallizes mid-run. E2E #7 GLM used this pattern across 6 phase docs and an `evolution_summary_v1.2.md`; the methodology was captured even though PATTERNS.md was never written.
+
+ - **`AGENT.md` decisions section + domain notes** — narrative-style; a living document of "what we know" and "why". Best for projects with rich domain context to capture (regulations, edge cases, thresholds, sample format distributions). E2E #7 GLM's AGENT.md included regulation enforcement dates, product type taxonomies, threshold values, and sample format counts — this is fine; it's a different idiom for the same goal.
+
+ What you should NOT do: skip persistence and rely only on the live conversation context. By the time you have N skills authored without any persisted methodology, you've made N implicit decisions about verdict shape, chunker boundaries, and worker tier. Each rule re-derives from scratch, and refactoring requires touching N files instead of one.
+
+ ❌ "I'll capture insights when I have time."
+
+ ✅ "Before each phase advance, write what I learned to whichever persistence file matches this project's idiom — even if it's tentative."
+
+ E2E history:
+ - E2E #6 v070: DS wrote PATTERNS.md only after a rollback. Per-skill decisions made before that point had to be re-touched. v0.7.1 added "PATTERNS.md FIRST" reinforcement.
+ - E2E #7 v071: neither DS nor GLM wrote PATTERNS.md, but GLM wrote 6 rich phase-completion logs and a comprehensive AGENT.md — the methodology WAS captured, just in different files. v0.7.2 blesses the broader principle: persist before you advance; the format is flexible.
+
+ The engine's filesystem-derived milestones (Group A v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract; the persistence file is your project's memory.
@@ -222,6 +222,24 @@ Regulations are often ambiguous. When you encounter ambiguity:
 
  Do not skip ambiguous rules. They are often the most important ones.
 
+ ## Sanity-check applicability against the sample corpus
+
+ After extracting your rule catalog and before authoring skills, do this 5-minute check: project each rule's applicability filter against the sample corpus.
+
+ For every rule:
+ 1. Walk `samples/` and classify each sample by product type / report type / document format
+ 2. For each rule, count how many samples it would apply to (per the rule's `applicability` field, scope filter, or whatever shape your catalog uses)
+ 3. Flag rules that apply to **0 samples** — they're either genuinely test-corpus-irrelevant (acceptable) or over-constrained (a bug)
+
+ E2E #7 GLM produced a 97-rule catalog where 36 rules (37%) had `PASS=0 FAIL=0 NOT_APPLICABLE=90` across all 90 documents — they never fired. Some were legitimate (rules for cash-management products with no cash-management samples in the corpus), but 36 inactive rules out of 97 was high enough to suggest scope-too-narrow drift.
+
+ If many rules are 0-sample, either:
+ - **Reframe their applicability** — broaden product types, look for evidence in headers/footers rather than just the body, relax the scope filter
+ - **Document them as "future scope"** and remove them from this iteration's catalog (still capture them in a `rules/future_scope.md` so they're not forgotten)
+ - **Update the test corpus** to include matching samples (work with the developer user)
+
+ Catching this in `rule_extraction` is much cheaper than authoring 36 skills that then test as inactive in `skill_testing`. The cheap projection here is worth the time it saves later.
+
  ## When Rules Change
 
  Regulations evolve. When the developer user adds new or updated regulation documents:
@@ -45,6 +45,32 @@ If yes, design a worker LLM prompt. Use the smallest model tier that maintains a
  ### The hybrid approach (most common)
  Most rules are a mix: regex extracts the number, Python compares it to the threshold, and the LLM handles the exceptional cases. Design the workflow as a pipeline where cheap steps run first and expensive steps run only when needed.
 
+ ### When regex alone isn't enough — decision rubric
+
+ Before declaring distillation complete, audit each rule's `verification_type` / `metric` / `evidence_type` (or the equivalent fields in your catalog). For rules where the required verification is one of:
+
+ - **Semantic** ("is this a positive guarantee or a disclaimer?")
+ - **Contextual** ("interpret this in light of the document's product type")
+ - **Counterfactual** ("what should this value be, given the other fields?")
+ - **Cross-field arithmetic** ("does 期初 + 收益 - 分配 = 期末?")
+
+ regex alone rarely suffices. Three acceptable forms:
+
+ 1. **Pure regex with documented limits** — write the regex check and include a comment explaining the fragility (e.g., "matches the syntactic pattern only; cannot detect semantic guarantees")
+ 2. **Hybrid regex + LLM** — a regex baseline catches the obvious cases; `worker_llm_call` (tier1-2) handles the ambiguous ones. The hybrid workflow declares which rule_ids escalate.
+ 3. **Pure LLM via `worker_llm_call`** — for fully semantic rules where no regex baseline is meaningful.
+
+ Don't ship pure regex for a rule whose `verification_type` is `judgment` / `semantic` without the documented-limits note. Future-you or a colleague will assume the regex is sufficient, and that bug will hide for months.
+
+ ### Worker LLM cost-aware tier choice
+
+ If you do escalate to an LLM:
+ - **tier1** (most capable, ~¥0.001-0.002/doc): cross-field reasoning, ambiguity resolution, rules that benefit from chain-of-thought
+ - **tier2-3**: bulk extraction with simple semantic checks
+ - **tier4** (cheapest): high-volume keyword-spotting that regex can't handle. Note: tier4 models on SiliconFlow are Qwen3.5 thinking-mode — `content` can come back empty if `reasoning_content` consumes max_tokens. Test with realistic prompts before relying on it. If you see empty responses, either bump max_tokens to ≥ 8192, shorten your prompt, or fall back to tier1-2.
+
+ Both v0.7.1 audit conductors (DS and GLM) defaulted to all-regex distillation and only added LLM escalation when the human user explicitly asked for "V2 with worker LLM". If your rule catalog has any rules where the verification is genuinely semantic, reach for `worker_llm_call` yourself — don't wait to be asked.
+
  ## Workflow Structure
 
  A workflow is a Python file (or small set of files) in `workflows/`:
@@ -101,6 +101,82 @@ The v0.6.2 D2 anti-pattern wording already states the failure case clearly:
 
  That came from E2E #4: one conductor wrote a 2,400-line `unified_qc.py` that ran every rule at once. It produced 1,150 ERROR verdicts (16.6%) because each rule's failure cascaded into every other rule's verdict. Per-rule skills are KC's unit of granularity for a reason.
 
+ ### Anti-pattern: check.py as a stub + the real logic in workflow.py
+
+ Do **NOT** write `rule_skills/<id>/check.py` as a placeholder that defers the real logic to `workflows/<id>/workflow.py`. KC's design intent: SKILL.md + check.py is the **canonical** verification; workflow.py is the **distilled, cheaper** form (regex first + LLM fallback). The relationship is skill → workflow, not the reverse.
+
+ ❌ Don't do this:
+ ```python
+ # rule_skills/R001/check.py — STUB, real logic elsewhere
+ def check(text):
+     rule_ids = re.findall(r"R\d{3}", load_skill())
+     return {rid: {"pass": None, "method": "stub",
+                   "note": "to be implemented in skill_testing"} for rid in rule_ids}
+ # actual verification logic only in workflows/R001/workflow_v1.py
+ ```
+
+ ✅ Do this:
+ ```python
+ # rule_skills/R001/check.py — canonical verification
+ def check(text):
+     matches = re.findall(r"...", text)  # real rule logic
+     return {"rule_id": "R001", "passed": bool(matches),
+             "evidence": matches[:3], "method": "regex"}
+
+ # workflows/R001/workflow_v1.py — distilled, cheaper form
+ def run(text, llm_fn=None):
+     result = check(text)  # baseline from the skill
+     if not result["passed"] and llm_fn:
+         result = llm_verify(text, llm_fn)  # escalate to the LLM on FAIL
+     return result
+ ```
+
+ Why it matters: the distillation phase's downstream consumers (the release tool, the run.py harness) load workflow.py. If check.py is a stub, the skill's methodology (SKILL.md) is reduced to documentation, and the verification logic is scattered across N workflow files. Later iterations on the skill (shifts in regulation interpretation, edge cases discovered in production) need one **canonical place** to update — the skill — not N workflows that have each drifted on their own.
+
+ E2E #6 v070 exposed this anti-pattern (DS wrote every bundled skill's check.py as `{"pass": null, "method": "stub"}`, deferring to workflows/). v0.7.1 wrote the anti-pattern into the skill explicitly.
+
+ E2E #7 v071 showed the anti-stub guidance worked on both conductors (neither run contained the `{"pass": null}` stub pattern), but **DS still inverted the canonical-vs-distilled relationship**: DS wrote 6 thematically grouped skill folders, each with only a SKILL.md (no check.py), while the real verification code lived in `workflows/<skill>/check.py`. No stubs is good; the inversion is not — changing one rule's logic then means touching both SKILL.md (the doc) and the workflow's check.py (the code), and the single source of truth is lost.
+
+ GLM v071, by contrast, landed the canonical pattern: 97/97 skills had both SKILL.md and a real `check.py` (regex + applicability logic, median 143 lines), while `workflows/<id>/workflow_v1.py` was a 50-line thin shell that just imported and called the skill's check.py:
+
+ ```python
+ # workflows/D01-01/workflow_v1.py — thin shell, 52 LOC
+ import importlib.util, json
+ from pathlib import Path
+
+ def run(doc_text: str, meta: dict = None) -> dict:
+     check_path = Path(__file__).parent.parent.parent / "rule_skills" / "D01-01" / "check.py"
+     spec = importlib.util.spec_from_file_location("check", check_path)
+     mod = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(mod)
+     result = mod.check(doc_text, meta)
+     result["_workflow"] = "D01-01_v1"
+     return result
+ ```
+
+ This is the v0.7.2+ canonical pattern: the workflow is a shell that points at the skill's check.py. To iterate on a rule's verification logic, edit `rule_skills/<id>/check.py`; the workflow doesn't change. v0.7.2 states the guidance more clearly: no stubs, AND keep the canonical relationship (the skill is canonical, the workflow is a distilled thin shell).
+
  ### Naming convention for grouped checks
 
  When you do bundle, name the file with the explicit range:
@@ -261,4 +337,26 @@ Keep the whole of PATTERNS.md to about 5 KB. Past that, trim the least actionable
  5. **Pick the first task.** Do it completely (skill + check + at least one local test). Write what you learned into PATTERNS.md. Move on to the next task.
  6. **Around task 5 and task 10:** stop and re-read PATTERNS.md. If the newly accumulated patterns suggest refactoring earlier work, **do it now** (cheap) rather than later (expensive).
 
- The engine's filesystem-derived milestones (v0.7.0 Group A) verify coverage against disk facts, however you split the work. The TaskBoard is your scratchpad; the disk is the contract.
+ ### Persist the methodology — PATTERNS.md, phase logs, or AGENT.md decisions
+
+ The principle: before each phase advance, write framework-level decisions to disk. Conversations get compacted, agents restart, and the next phase loses context. Whichever format you choose, **write to disk** — don't depend on conversation context that will disappear.
+
+ Three formats, each defensible; pick one and stick with it:
+
+ - **`rules/PATTERNS.md`** — concise, framework-level content only, updated as the project progresses. Suits greenfield projects where the hypothesis can be stated up front and the structure is clear. Capped at ~5 KB; entries are transferable shapes / project-level constraints / anti-patterns with rationale (see the "What to write" section above).
+
+ - **`logs/phase_<name>_complete.md` per phase** — incremental; records what each phase produced, which decisions were made, and what the next phase inherits. Suits iterative, discover-as-you-go work. E2E #7 GLM used this pattern: 6 phase docs plus an `evolution_summary_v1.2.md` — the methodology was captured all the same, just never as a PATTERNS.md.
+
+ - **`AGENT.md` decisions section + domain notes** — narrative style; a living document of "what we know" and "why". Suits projects with rich domain context to capture (regulations, edge cases, thresholds, sample format distributions). E2E #7 GLM's AGENT.md carried regulation effective dates, product type taxonomies, threshold values, and sample format counts — perfectly fine; it's a different idiom for the same goal.
+
+ What not to do: skip persistence and live on conversation context alone. By the time you've authored N skills without writing the methodology to disk, you've made N implicit decisions about verdict shape, chunker boundaries, and worker tier — every rule re-derived from scratch, and refactoring means touching N files instead of one.
+
+ ❌ "I'll write up these insights when I have time."
+
+ ✅ "Before each phase advance, write what this phase taught me into whichever persistence file fits this project's idiom — even if it's only a draft."
+
+ E2E history:
+ - E2E #6 v070: DS only wrote PATTERNS.md after the user intervened and rolled back. Until then each skill's design decisions had solidified on their own and had to be touched again later. v0.7.1 added the "PATTERNS.md FIRST" guidance.
+ - E2E #7 v071: neither DS nor GLM wrote PATTERNS.md, but GLM wrote 6 phase-completion logs and a thorough AGENT.md — the methodology *was* captured, just in different files. v0.7.2 writes the broader principle into the skill: persist before you advance; the format is flexible.
+
+ The engine's filesystem-derived milestones (v0.7.0 Group A) verify coverage against disk facts, however you split the work. The TaskBoard is your scratchpad; the disk is the contract; the persistence file is the project's memory.