kc-beta 0.7.1 → 0.7.2

package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "kc-beta",
-  "version": "0.7.1",
+  "version": "0.7.2",
   "description": "KC Agent — LLM document verification agent (pure Node.js CLI). Dual-licensed: PolyForm Noncommercial 1.0.0 for personal/noncommercial use; commercial license required for enterprise production. See LICENSE and LICENSE-COMMERCIAL.md.",
   "type": "module",
   "bin": {
@@ -21,6 +21,7 @@ import { SnapshotTool } from "./tools/snapshot.js";
 import { ArchiveFileTool } from "./tools/archive-file.js";
 import { ScheduleFetchTool } from "./tools/schedule-fetch.js";
 import { ReleaseTool } from "./tools/release.js";
+import { readKcVersion } from "../util/kc-version.js";
 import { PhaseAdvanceTool } from "./tools/phase-advance.js";
 import { DocumentParseTool } from "./tools/document-parse.js";
 import { DocumentSearchTool } from "./tools/document-search.js";
@@ -421,7 +422,7 @@ export class AgentEngine {
       new SnapshotTool(this.workspace),
       new ArchiveFileTool(this.workspace),
       new ScheduleFetchTool(this.workspace),
-      new ReleaseTool(this.workspace, { kcVersion: "0.5.2" }),
+      new ReleaseTool(this.workspace, { kcVersion: readKcVersion() }),
       new PhaseAdvanceTool(
         // v0.7.1 2c: advanceFn returns rich `{advanced, engineCounts?}`
         // so the tool's refusal text can surface the engine telemetry
@@ -1679,7 +1680,24 @@ export class AgentEngine {
         parts.push(`monitoring: ${pipeline.monitoringPhase ?? "?"}`);
         break;
       }
-      // bootstrap / finalization: no specific counters, fall through
+      case "bootstrap": {
+        // v0.7.2 1e: previously fell through to empty string. Both
+        // v0.7.1 audit runs had bootstrap → rule_extraction refusals
+        // with engineCounts: "" — agent saw the refusal but had no
+        // engine telemetry to react to. The InitializerPipeline tracks
+        // boolean checklist flags rather than numeric counters; we
+        // surface those flags as "yes/no" so the agent can see which
+        // bootstrap criterion is missing.
+        if (typeof pipeline.describeBootstrapChecklist === "function") {
+          const cl = pipeline.describeBootstrapChecklist();
+          parts.push(`workspaceCreated: ${cl.workspaceCreated ? "yes" : "no"}`);
+          parts.push(`configReady: ${cl.configReady ? "yes" : "no"}`);
+          parts.push(`hasRegulations: ${cl.hasRegulations ? "yes" : "no"}`);
+          parts.push(`hasSamples: ${cl.hasSamples ? "yes" : "no"}`);
+        }
+        break;
+      }
+      // finalization: no specific counters, fall through
     }
   } catch { /* never let summary build break phase advance */ }
   return parts.join(", ");
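With the checklist wired through, a refused bootstrap → rule_extraction advance now carries telemetry of the form `workspaceCreated: yes, configReady: yes, hasRegulations: no, hasSamples: no` (illustrative flag values) instead of the empty `engineCounts: ""` string both v0.7.1 audit runs saw.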
@@ -57,16 +57,21 @@ function listChildFiles(p) {
 // Walk a directory recursively, yielding every file path. Skips hidden
 // dirs/files and __pycache__. Used by derive functions that need to
 // match arbitrarily-nested artifacts (e.g., scripts/ subdirs).
-function* walkFiles(root) {
+//
+// v0.7.2 1a: optional maxDepth caps recursion. depth=0 is root's
+// direct children; depth=1 is one level down. Default unbounded
+// (existing callers).
+function* walkFiles(root, { maxDepth } = {}) {
   if (!dirExists(root)) return;
-  const stack = [root];
+  const stack = [{ dir: root, depth: 0 }];
   while (stack.length) {
-    const dir = stack.pop();
+    const { dir, depth } = stack.pop();
     for (const e of readDirSafe(dir)) {
       if (e.name.startsWith(".") || e.name === "__pycache__") continue;
       const p = path.join(dir, e.name);
-      if (e.isDirectory()) stack.push(p);
-      else if (e.isFile()) yield p;
+      if (e.isDirectory()) {
+        if (maxDepth == null || depth < maxDepth) stack.push({ dir: p, depth: depth + 1 });
+      } else if (e.isFile()) yield p;
     }
   }
 }
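Call sites that want the cap pass it explicitly, as the milestone-derivation code below now does with `walkFiles(outputDir, { maxDepth: 6 })`; existing callers that omit the option keep the old unbounded walk.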
@@ -271,48 +276,66 @@ export function deriveSkillTestingMilestones(workspace) {
     }
   }
 
-  // v0.7.1 1a: also credit rules whose verdicts appear in output/*.json.
-  // Agents naturally write batch-test results to output/, not per-skill
-  // paths. v0.6.x's _loadTestResults already reads here on the canonical
-  // accuracy schema; this expands the helper-derived milestone to
-  // recognize the same shape (plus the GLM/DS-shape variants seen in
-  // E2E #6 v070). Without this, agents who run tests via sandbox_exec
-  // and persist to output/ saw skillsTested=0 and force-bypassed.
-  const collectFromJsonFile = (data) => {
-    if (!data) return;
-    if (data.rule_id) tested.add(data.rule_id);
-    if (Array.isArray(data) && data[0] && typeof data[0] === "object" && data[0].rule_id) {
-      for (const r of data) if (r?.rule_id) tested.add(r.rule_id);
+  // v0.7.1 1a / v0.7.2 1a: credit rules whose verdicts appear anywhere
+  // under output/*.json. Agents persist batch-test results in
+  // conductor-specific shapes (this is the recurring drift point:
+  // engine derivation has to match disk reality, not the other way
+  // around). Shapes seen across E2E #5/6/7:
+  //
+  // - DS v0.7.0/0.7.1: catalog.json as array of {id: "R001", ...}
+  //   entries; skill_test_*.json as {doc_name: {R019a: bool, ...}};
+  //   skill_test_阳光资产.json with {doc, results: {R019a: ...}}
+  // - GLM v0.7.1: rule_stats.json as {D01-01: {PASS, FAIL, NA}, ...};
+  //   full_test_results_v[1-6].json as {sample_id: {path, meta,
+  //   results: {D01-01: {verdict, ...}}}} (nested 2 levels deep, why
+  //   v0.7.1's shallow walk missed them)
+  //
+  // The collector recurses (depth-limited) and uses two heuristics to
+  // separate rule_ids from sample_ids / doc_names:
+  //   1. Rule-id shape: starts with letter, ≤ 30 chars, contains digits
+  //      (matches R001, D01-01, T02-31; rejects 06f2ed1488, doc paths)
+  //   2. Verdict-shape on values: {verdict, passed, pass, PASS, FAIL}
+  //      keys signal that the parent dict's keys are rule_ids
+  const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
+  const isRuleIdShape = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);
+  const looksLikeVerdict = (v) =>
+    v && typeof v === "object" && !Array.isArray(v) && (
+      v.verdict !== undefined ||
+      v.passed !== undefined ||
+      v.pass !== undefined ||
+      typeof v.PASS === "number" ||
+      typeof v.FAIL === "number"
+    );
+  const collectFromJsonFile = (data, depth = 0) => {
+    if (!data || depth > 4) return;
+    if (typeof data !== "object") return;
+    if (Array.isArray(data)) {
+      for (const r of data) collectFromJsonFile(r, depth + 1);
+      return;
+    }
+    // {rule_id: "X"} or {id: "R001"} on a rule entry
+    if (isRuleIdShape(data.rule_id)) tested.add(data.rule_id);
+    if (isRuleIdShape(data.id)) tested.add(data.id);
+    // {<rule_id>: <verdict_shaped>, ...} (rule_stats / per-doc test_results)
+    for (const [k, v] of Object.entries(data)) {
+      if (isRuleIdShape(k) && looksLikeVerdict(v)) tested.add(k);
     }
-    if (data.results && typeof data.results === "object") {
-      for (const k of Object.keys(data.results)) tested.add(k);
+    // {results: {<rule_id>: ...}} (keys must look rule-id-shaped)
+    if (data.results && typeof data.results === "object" && !Array.isArray(data.results)) {
+      for (const k of Object.keys(data.results)) {
+        if (isRuleIdShape(k)) tested.add(k);
+      }
+    }
+    // Recurse into nested objects (handles {sample_id: {results: {...}}})
+    for (const v of Object.values(data)) {
+      if (v && typeof v === "object") collectFromJsonFile(v, depth + 1);
     }
   };
 
   const outputDir = path.join(cwd, "output");
-  if (dirExists(outputDir)) {
-    for (const f of listChildFiles(outputDir)) {
-      if (!f.name.endsWith(".json")) continue;
-      collectFromJsonFile(readJsonSafe(path.join(outputDir, f.name)));
-    }
-    // One level into output/results/, output/distillation/ — the two
-    // most common batch-result locations across E2E #5 and v070 sessions.
-    for (const sub of ["results", "distillation", "qc"]) {
-      const subDir = path.join(outputDir, sub);
-      if (!dirExists(subDir)) continue;
-      for (const f of listChildFiles(subDir)) {
-        if (!f.name.endsWith(".json")) continue;
-        collectFromJsonFile(readJsonSafe(path.join(subDir, f.name)));
-      }
-      // GLM v070 wrote per-rule subdirs under output/results/<rule_id>/
-      // — walk one more level for that pattern.
-      for (const child of listChildDirs(subDir)) {
-        for (const f of listChildFiles(path.join(subDir, child.name))) {
-          if (!f.name.endsWith(".json")) continue;
-          collectFromJsonFile(readJsonSafe(path.join(subDir, child.name, f.name)));
-        }
-      }
-    }
+  for (const p of walkFiles(outputDir, { maxDepth: 6 })) {
+    if (!p.endsWith(".json")) continue;
+    collectFromJsonFile(readJsonSafe(p));
   }
 
   // DS v070 wrote a top-level aggregate at either rules/test_results.json
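To make the collector's two heuristics concrete, here is a minimal Python rendering of the same shape tests; the artifact literal is illustrative, not from a real run:

```python
import re

RULE_ID = re.compile(r"^[A-Za-z][A-Za-z0-9_-]{0,29}$")

def is_rule_id_shape(s):
    # Heuristic 1: letter-led, at most 30 chars, and must contain a digit.
    return isinstance(s, str) and bool(RULE_ID.match(s)) and any(c.isdigit() for c in s)

def looks_like_verdict(v):
    # Heuristic 2: verdict-ish keys on a value mark the parent dict's
    # keys as rule_ids.
    return isinstance(v, dict) and (
        "verdict" in v or "passed" in v or "pass" in v
        or isinstance(v.get("PASS"), int) or isinstance(v.get("FAIL"), int)
    )

# Illustrative artifact: a sample_id wrapping per-rule verdicts two levels down.
doc = {"sample_001": {"results": {"D01-01": {"verdict": "PASS"}}}}
assert is_rule_id_shape("D01-01") and is_rule_id_shape("R001")
assert not is_rule_id_shape("06f2ed1488")        # digit-led: rejected
assert not is_rule_id_shape("quarterly_report")  # no digits: rejected
assert looks_like_verdict(doc["sample_001"]["results"]["D01-01"])
```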
@@ -5,6 +5,7 @@ import { PipelineEvent } from "./index.js";
 import { Pipeline } from "./base.js";
 import { normalizeRuleCatalog } from "../rule-catalog-normalize.js";
 import { deriveFinalizationMilestones } from "./_milestone-derive.js";
+import { readKcVersion } from "../../util/kc-version.js";
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 // v0.7.0 N: ship template/release/v1/ from the npm package; copy into
@@ -310,12 +311,7 @@ export class FinalizationPipeline extends Pipeline {
   }
 
   _readKcVersion() {
-    try {
-      const pkg = JSON.parse(fs.readFileSync(
-        path.resolve(__dirname, "../../../package.json"), "utf-8",
-      ));
-      return pkg.version || "unknown";
-    } catch { return "unknown"; }
+    return readKcVersion();
   }
 
   /**
@@ -240,6 +240,19 @@ export class ProjectInitializer extends Pipeline {
     return this.workspaceCreated && this.configReady && this.hasRegulations && this.hasSamples;
   }
 
+  // v0.7.2 1e: surface the checklist as engine telemetry so
+  // `_buildEngineCountsBlock("bootstrap")` has something to report when
+  // bootstrap → rule_extraction is refused. Agent sees the missing
+  // criteria directly in the refusal text.
+  describeBootstrapChecklist() {
+    return {
+      workspaceCreated: !!this.workspaceCreated,
+      configReady: !!this.configReady,
+      hasRegulations: !!this.hasRegulations,
+      hasSamples: !!this.hasSamples,
+    };
+  }
+
   /**
    * v0.6.3 (#74): nudge the agent when it does work that belongs to a later
    * phase. Bootstrap is setup — reading rules/samples, configuring keys,
@@ -178,7 +178,21 @@ export class ReleaseTool extends BaseTool {
       path.join(bundleAbs, "glossary.json"), { fallback: '{"version":1,"entries":[]}\n' });
     this._copyIfExists(path.join(this._workspace.cwd, "corner_cases.json"),
       path.join(bundleAbs, "corner_cases.json"), { fallback: '[]\n' });
-    this._copyIfExists(path.join(this._workspace.cwd, "confidence_calibration.json"),
+    // v0.7.2 1c: auto-aggregate from output/ if no calibration file at
+    // workspace root. Both v0.7.1 audit runs (DS + GLM) shipped releases
+    // with empty `historical_accuracy: {}` despite having per-rule QC
+    // data on disk under output/ — the release tool just passed the
+    // file through and emitted a stub on miss. We try to populate from
+    // known QC artifact shapes here; if nothing matches, fall through
+    // to the existing stub fallback.
+    const calibSrc = path.join(this._workspace.cwd, "confidence_calibration.json");
+    if (!fs.existsSync(calibSrc)) {
+      const aggregated = this._aggregateAccuracyFromOutput();
+      if (aggregated && Object.keys(aggregated.historical_accuracy).length > 0) {
+        fs.writeFileSync(calibSrc, JSON.stringify(aggregated, null, 2) + "\n", "utf-8");
+      }
+    }
+    this._copyIfExists(calibSrc,
       path.join(bundleAbs, "confidence_calibration.json"),
       { fallback: '{"historical_accuracy":{}}\n' });
 
@@ -233,6 +247,30 @@ export class ReleaseTool extends BaseTool {
       .replace(/\{RULES_LIST\}/g, rulesList);
     fs.writeFileSync(path.join(bundleAbs, "README.md"), readme, "utf-8");
 
+    // v0.7.2 1d: clean up the template scaffold dir if a customized
+    // release was just written alongside it. Both v0.7.1 audit runs
+    // shipped with `output/releases/v1/` (template-derived, .tmpl
+    // files lingering) AND `output/releases/v1-0/` (or v1-0-hybrid/)
+    // — the customized release. The pre-scaffold is meant as a hint;
+    // once the agent calls `release(label="v1-0")` and we've written
+    // the real bundle, the unedited scaffold is just clutter.
+    //
+    // Conservative gate: only delete a sibling `v1/` if BOTH (a) we
+    // didn't just write to v1/ ourselves, AND (b) it still contains
+    // .tmpl files (signature of unedited template). If the agent
+    // intentionally edited v1/ in place (removing .tmpl), our cleanup
+    // leaves it alone.
+    if (slug !== "v1") {
+      const tmplScaffold = path.join(this._workspace.resolvePath(path.join("output", "releases")), "v1");
+      if (fs.existsSync(tmplScaffold) && fs.statSync(tmplScaffold).isDirectory()) {
+        let hasTmpl = false;
+        try { hasTmpl = fs.readdirSync(tmplScaffold).some((f) => f.endsWith(".tmpl")); } catch { /* ignore */ }
+        if (hasTmpl) {
+          try { fs.rmSync(tmplScaffold, { recursive: true, force: true }); } catch { /* best-effort */ }
+        }
+      }
+    }
+
     // Bundle dir is in output/ (gitignored). Snapshot manifest in snapshots/ IS tracked.
     const lines = [
       `Release '${label}' bundled at ${bundleRel}`,
@@ -319,6 +357,118 @@ export class ReleaseTool extends BaseTool {
     return null;
   }
 
+  // v0.7.2 1c: walk output/ for QC artifacts and aggregate per-rule
+  // accuracy. Recognized shapes (covering DS + GLM v0.7.1 audit runs):
+  //
+  //   rule_stats_v*.json — {<rule_id>: {PASS: N, FAIL: N, NOT_APPLICABLE: N, ERROR: N}}
+  //     (GLM produced 4 versions; pick the highest)
+  //   full_test_results_v*.json — {<sample_id>: {results: {<rule_id>: {verdict}}}}
+  //     (GLM; accumulate verdicts per rule across samples)
+  //   skill_test_*.json — {<doc_name>: {<rule_id>: bool}} (DS shape)
+  //
+  // Returns null if no recognized artifact, or an object with
+  // { historical_accuracy: {<rule_id>: {pass_rate, n_samples, ...}}, computed_at, source_files }
+  // suitable for confidence_calibration.json.
+  _aggregateAccuracyFromOutput() {
+    const ruleIdShape = /^[A-Za-z][A-Za-z0-9_-]{0,29}$/;
+    const isRuleId = (s) => typeof s === "string" && ruleIdShape.test(s) && /\d/.test(s);
+    const tally = new Map(); // rule_id -> {pass, fail, na, n}
+    const sourceFiles = [];
+    const bump = (rid, kind) => {
+      if (!isRuleId(rid)) return;
+      const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
+      t[kind] += 1;
+      t.n += 1;
+      tally.set(rid, t);
+    };
+    const outputDir = path.join(this._workspace.cwd, "output");
+    if (!fs.existsSync(outputDir)) return null;
+
+    // Collect all .json files under output/ (depth limited)
+    const files = [];
+    const stack = [{ dir: outputDir, depth: 0 }];
+    while (stack.length) {
+      const { dir, depth } = stack.pop();
+      let entries;
+      try { entries = fs.readdirSync(dir, { withFileTypes: true }); } catch { continue; }
+      for (const e of entries) {
+        if (e.name.startsWith(".") || e.name === "__pycache__") continue;
+        const p = path.join(dir, e.name);
+        if (e.isDirectory()) {
+          if (depth < 6) stack.push({ dir: p, depth: depth + 1 });
+        } else if (e.isFile() && e.name.endsWith(".json")) {
+          files.push({ path: p, name: e.name });
+        }
+      }
+    }
+
+    // 1) Prefer rule_stats_v<N>.json (highest version) — direct counts
+    const ruleStatsFiles = files
+      .filter((f) => /^rule_stats(?:_v\d+)?\.json$/i.test(f.name))
+      .map((f) => ({ ...f, ver: (f.name.match(/_v(\d+)/) || [0, 0])[1] | 0 }))
+      .sort((a, b) => b.ver - a.ver);
+    if (ruleStatsFiles.length > 0) {
+      const top = ruleStatsFiles[0];
+      try {
+        const d = JSON.parse(fs.readFileSync(top.path, "utf-8"));
+        for (const [rid, stats] of Object.entries(d)) {
+          if (!isRuleId(rid) || !stats || typeof stats !== "object") continue;
+          const pass = stats.PASS | 0, fail = stats.FAIL | 0;
+          const na = stats.NOT_APPLICABLE | stats.NA | 0;
+          const t = tally.get(rid) || { pass: 0, fail: 0, na: 0, n: 0 };
+          t.pass += pass; t.fail += fail; t.na += na; t.n += pass + fail + na;
+          tally.set(rid, t);
+        }
+        sourceFiles.push(path.relative(this._workspace.cwd, top.path));
+      } catch { /* fall through to other shapes */ }
+    }
+
+    // 2) Fallback: full_test_results*.json with nested {sample_id: {results: {rid: {verdict}}}}
+    if (tally.size === 0) {
+      const ftrFiles = files
+        .filter((f) => /^full_test_results(?:_v\d+)?\.json$/i.test(f.name))
+        .map((f) => ({ ...f, ver: (f.name.match(/_v(\d+)/) || [0, 0])[1] | 0 }))
+        .sort((a, b) => b.ver - a.ver);
+      for (const f of ftrFiles.slice(0, 1)) {
+        try {
+          const d = JSON.parse(fs.readFileSync(f.path, "utf-8"));
+          for (const sample of Object.values(d)) {
+            if (!sample || typeof sample !== "object") continue;
+            const results = sample.results;
+            if (!results || typeof results !== "object") continue;
+            for (const [rid, r] of Object.entries(results)) {
+              if (!isRuleId(rid) || !r || typeof r !== "object") continue;
+              const verdict = (r.verdict || "").toString().toUpperCase();
+              if (verdict === "PASS") bump(rid, "pass");
+              else if (verdict === "FAIL") bump(rid, "fail");
+              else if (verdict === "NOT_APPLICABLE" || verdict === "NA") bump(rid, "na");
+            }
+          }
+          sourceFiles.push(path.relative(this._workspace.cwd, f.path));
+        } catch { /* try next shape */ }
+      }
+    }
+
+    if (tally.size === 0) return null;
+
+    const historical_accuracy = {};
+    for (const [rid, t] of tally.entries()) {
+      const fired = t.pass + t.fail;
+      historical_accuracy[rid] = {
+        pass_rate: fired > 0 ? +(t.pass / fired).toFixed(4) : null,
+        n_passed: t.pass,
+        n_failed: t.fail,
+        n_not_applicable: t.na,
+        n_samples: t.n,
+      };
+    }
+    return {
+      historical_accuracy,
+      computed_at: new Date().toISOString(),
+      source_files: sourceFiles,
+    };
+  }
+
   _readWorkerTiers() {
     const envPath = path.join(this._workspace.cwd, ".env");
     const out = { tier1: "", tier2: "", tier3: "", tier4: "" };
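For reference, a calibration file produced by this aggregation would be shaped roughly as follows (rule id, counts, and timestamp are illustrative):

```json
{
  "historical_accuracy": {
    "D01-01": {
      "pass_rate": 0.9222,
      "n_passed": 83,
      "n_failed": 7,
      "n_not_applicable": 0,
      "n_samples": 90
    }
  },
  "computed_at": "2026-01-15T00:00:00.000Z",
  "source_files": ["output/qc/rule_stats_v4.json"]
}
```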
@@ -0,0 +1,27 @@
+// Single source of truth for the live KC CLI version string.
+//
+// Reads package.json once. Used by engine.js (passed to ReleaseTool so
+// release manifests stamp the correct version) and by
+// pipelines/finalization.js (anywhere it surfaces "Built by kc-beta X").
+//
+// Before v0.7.2, engine.js hardcoded `kcVersion: "0.5.2"` which leaked
+// into every release manifest's `kc_beta_version` field regardless of
+// the actual package version. Both v0.7.1 audit runs (DS + GLM)
+// surfaced this. Reading package.json closes the gap.
+
+import fs from "node:fs";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+
+export function readKcVersion() {
+  try {
+    const pkgPath = path.resolve(__dirname, "../../package.json");
+    const pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
+    return pkg.version || "unknown";
+  } catch {
+    return "unknown";
+  }
+}
@@ -223,6 +223,24 @@ Regulations are often ambiguous. When you encounter ambiguity:
 
 Do not skip ambiguous rules. They are often the most important ones.
 
+## Sanity-check applicability against the sample corpus
+
+After extracting your rule catalog and before authoring skills, do this 5-minute check: project each rule's applicability filter against the sample corpus.
+
+The check:
+1. Walk `samples/` and classify each sample by product type / report type / document format
+2. For each rule, count how many samples it would apply to (per the rule's `applicability` field, scope filter, or whatever shape your catalog uses)
+3. Flag rules that apply to **0 samples** — they're either genuinely test-corpus-irrelevant (acceptable) or over-constrained (a bug)
+
+E2E #7 GLM produced a 97-rule catalog where 36 rules (37%) had `PASS=0 FAIL=0 NOT_APPLICABLE=90` across all 90 documents — they never fired. Some were legit (rules for cash-management products with no cash-management samples in the corpus), but 36 inactive rules out of 97 was high enough to suggest scope-too-narrow drift.
+
+If many rules are 0-sample:
+- **Reframe their applicability** — broaden product types, look for evidence in headers/footers not just the body, relax the scope filter
+- **Document them as "future scope"** and remove them from this iteration's catalog (still capture them in a `rules/future_scope.md` so they're not forgotten)
+- **Update the test corpus** to include matching samples (work with the developer user)
+
+Catching this in `rule_extraction` is much cheaper than authoring 36 skills that then test as inactive in `skill_testing`. The cheap projection here is worth the time it saves later.
+
 ## When Rules Change
 
 Regulations evolve. When the developer user adds new or updated regulation documents:
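A minimal sketch of the projection step described above, in Python. The catalog layout, the `applicability`/`product_types` field names, and the trivial classifier are assumptions for illustration; the skill deliberately leaves the catalog shape up to the project:

```python
import json
from pathlib import Path

def classify(sample_path: Path) -> str:
    # Assumed stand-in: a real classifier would read the document and
    # tag product type / report type / format.
    return "bank_wealth" if "bank" in sample_path.name else "trust"

def project_applicability(catalog_path: str, samples_dir: str) -> dict:
    # Assumes the catalog is a list of {id, applicability: {product_types: [...]}}.
    rules = json.loads(Path(catalog_path).read_text(encoding="utf-8"))
    tags = [classify(p) for p in Path(samples_dir).rglob("*") if p.is_file()]
    counts = {}
    for rule in rules:
        wanted = set(rule.get("applicability", {}).get("product_types", []))
        counts[rule["id"]] = sum(1 for t in tags if not wanted or t in wanted)
    return counts

# Rules with a 0 count are either genuinely out of corpus scope or over-constrained:
# zero = [rid for rid, n in project_applicability("rules/catalog.json", "samples").items() if n == 0]
```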
@@ -45,6 +45,32 @@ If yes, design a worker LLM prompt. Use the smallest model tier that maintains a
 ### The hybrid approach (most common)
 Most rules are a mix: regex extracts the number, Python compares it to the threshold, LLM handles the exceptional cases. Design the workflow as a pipeline where cheap steps run first and expensive steps run only when needed.
 
+### When regex alone isn't enough — decision rubric
+
+Before declaring distillation complete, audit each rule's `verification_type` / `metric` / `evidence_type` (or the equivalent fields in your catalog). For rules where the required verification is one of:
+
+- **Semantic** ("is this a positive guarantee or a disclaimer?")
+- **Contextual** ("interpret this in light of the document's product type")
+- **Counterfactual** ("what should this value be, given the other fields?")
+- **Cross-field arithmetic** ("does 期初 + 收益 - 分配 = 期末?")
+
+regex alone rarely suffices. Three acceptable forms:
+
+1. **Pure regex with documented limits** — write the regex check, include a comment explaining the fragility (e.g., "matches syntactic pattern only; cannot detect semantic guarantees")
+2. **Hybrid regex + LLM** — regex baseline catches obvious cases, `worker_llm_call` (tier1-2) handles ambiguous ones. The hybrid workflow declares which rule_ids escalate.
+3. **Pure LLM via `worker_llm_call`** — for fully semantic rules where no regex baseline is meaningful.
+
+Don't ship pure regex for a rule whose `verification_type` is `judgment` / `semantic` without the documented-limits note. Future-you or a colleague will assume the regex is sufficient, and that bug will hide for months.
+
+### Worker LLM cost-aware tier choice
+
+If you do escalate to LLM:
+- **tier1** (most capable, ~¥0.001-0.002/doc): cross-field reasoning, ambiguity resolution, rules that benefit from chain-of-thought
+- **tier2-3**: bulk extraction with simple semantic checks
+- **tier4** (cheapest): high-volume keyword-spotting that regex can't handle. Note: tier4 models on SiliconFlow are Qwen3.5 thinking-mode — `content` can return empty if `reasoning_content` consumes max_tokens. Test with realistic prompts before relying on them. If you see empty responses, bump max_tokens to ≥8192, shorten your prompt, or fall back to tier1-2.
+
+Both v0.7.1 audit conductors (DS and GLM) defaulted to all-regex distillation and only added LLM escalation when the human user explicitly asked for "V2 with worker LLM". If your rule catalog has any rules where the verification is genuinely semantic, reach for `worker_llm_call` yourself — don't wait to be asked.
+
 ## Workflow Structure
 
 A workflow is a Python file (or small set of files) in `workflows/`:
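A compact sketch of form 2, the hybrid shape, from the rubric above. The trigger regex and the guarantee rule are invented for the example, and `ask_worker_llm` is a hypothetical stand-in for however a project wires `worker_llm_call`; it is not a real API:

```python
import re

GUARANTEE_HINT = re.compile(r"保本|保证收益|guaranteed return")

def ask_worker_llm(tier: str, prompt: str) -> str:
    # Hypothetical placeholder for the project's worker_llm_call wiring;
    # a real implementation would dispatch to the configured tier model.
    return "disclaimer"

def check(doc_text: str, meta: dict = None) -> dict:
    """Hybrid form: cheap regex baseline first, LLM only for the ambiguous middle."""
    hits = GUARANTEE_HINT.findall(doc_text)
    if not hits:
        # No syntactic trigger at all: cheap PASS, zero LLM spend.
        return {"verdict": "PASS", "method": "regex"}
    # Triggered text may be a disclaimer ("不保本") rather than a positive
    # guarantee: a semantic call, so escalate to the worker tier.
    answer = ask_worker_llm(
        tier="tier2",
        prompt="Is this a positive guarantee or a disclaimer?\n" + doc_text[:2000],
    )
    verdict = "FAIL" if "guarantee" in answer.lower() else "PASS"
    return {"verdict": verdict, "method": "regex+llm", "matches": hits}
```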
@@ -147,6 +147,41 @@ E2E #6 v070 surfaced this pattern (DS bundled-skill check.py files
 all returned `{"pass": null, "method": "stub"}` deferring to
 workflows/). v0.7.1 added this anti-pattern explicitly.
 
+E2E #7 v071 showed the teaching prevented the stub anti-pattern in
+both conductors (no `{"pass": null}` patterns in either run), but
+**DS still inverted the canonical-vs-distilled relationship**: DS's
+6 thematic skill folders had SKILL.md only (no check.py), with the
+real verification code living in `workflows/<skill>/check.py`. The
+absence of stubs is good; the inversion is not — editing a rule then
+requires touching both SKILL.md (the doc) and the workflow check.py
+(the code). Single source of truth is lost.
+
+GLM v071 by contrast landed the canonical pattern: 97/97 skills had
+both SKILL.md AND a real `check.py` (median 143 LOC of regex +
+applicability logic), and `workflows/<id>/workflow_v1.py` was a
+50-line thin wrapper that imported and called it:
+
+```python
+# workflows/D01-01/workflow_v1.py — thin wrapper, 52 LOC
+import importlib.util, json
+from pathlib import Path
+
+def run(doc_text: str, meta: dict = None) -> dict:
+    check_path = Path(__file__).parent.parent.parent / "rule_skills" / "D01-01" / "check.py"
+    spec = importlib.util.spec_from_file_location("check", check_path)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    result = mod.check(doc_text, meta)
+    result["_workflow"] = "D01-01_v1"
+    return result
+```
+
+This is the v0.7.2+ canonical pattern: the workflow is a shim that
+points at the skill's check.py. To iterate on a rule's verification,
+edit `rule_skills/<id>/check.py`. The workflow doesn't change. v0.7.2
+clarifies the teaching: avoid stubs AND keep the canonical
+relationship (the skill is canonical, the workflow is a distilled wrapper).
+
 ### Naming convention for grouped checks
 
 When you do bundle, name the file with the explicit range:
@@ -309,18 +344,26 @@ When entering skill_authoring with an empty TaskBoard:
 5. **Pick the first task.** Work it to completion (skill + check + at least one local test). Update PATTERNS.md with whatever you learned. Move to the next task.
 6. **At task ~5 and task ~10:** stop and re-read PATTERNS.md. If patterns suggest a refactor of earlier work, do it now (cheap) rather than later (expensive).
 
-### Why PATTERNS.md FIRST, before any skill code
+### Persisted methodology — PATTERNS.md OR phase logs OR AGENT.md decisions
+
+The principle: capture framework-level decisions to disk before each phase advance. The conversation will compact, agents will restart, the next phase will lose grounding. Whichever format you pick, write to disk — don't rely on conversation context that disappears.
+
+Three formats, each defensible. Pick one and stick with it:
+
+- **`rules/PATTERNS.md`** — concise, framework-only, updated as the project progresses. Best for greenfield projects with a clear hypothesis-up-front structure. Capped at ~5 KB; entries are transferable shapes / project constraints / anti-patterns with rationale (see "What to write" above).
 
-If you start writing skill code (rule_skills/<id>/check.py) before PATTERNS.md exists, **stop**. Even a 200-byte initial PATTERNS.md ("decided Shannon-Huffman; first hard rule R028 will dictate verdict shape; sample corpus has bilingual table headings") sets the framework. You'll save the time later not re-deriving the same shapes per rule.
+- **`logs/phase_<name>_complete.md` per phase** — incremental; captures what each phase produced, the decisions made, and what the next phase inherits. Best for iterative discovery work where the framework crystallizes mid-run. E2E #7 GLM used this pattern across 6 phase docs and an `evolution_summary_v1.2.md`; the methodology was captured even though PATTERNS.md was never written.
 
-"I'll write the skills first, then PATTERNS.md when I have insights."
+- **`AGENT.md` decisions section + domain notes** — narrative-style, a living document of "what we know" and "why". Best for projects with rich domain context to capture (regulations, edge cases, thresholds, sample format distributions). E2E #7 GLM's AGENT.md included regulation enforcement dates, product type taxonomies, threshold values, and sample format counts — this is fine; it's a different idiom for the same goal.
 
-By the time you have N skills, you've made N implicit decisions about verdict shape, chunker boundaries, worker tier — each rule re-derives from scratch. Refactoring requires touching N files instead of one.
+What you should NOT do: skip persistence and rely only on the live conversation context. By the time you have N skills authored without any persisted methodology, you've made N implicit decisions about verdict shape, chunker boundaries, and worker tier. Each rule re-derives them from scratch. Refactoring requires touching N files instead of one.
 
-"Write PATTERNS.md, even tentatively, then re-read it before each new rule. Update it when discoveries change the framework."
+"I'll capture insights when I have time."
 
-PATTERNS.md is your project's index card. Build it before the work, update it during the work, harvest it after.
+"Before each phase advance, write what I learned to whichever persistence file matches this project's idiom — even if it's tentative."
 
-E2E #6 v070 surfaced this: DS only wrote PATTERNS.md after a rollback intervention; the per-skill design decisions before that point were already locked in and had to be re-touched. v0.7.1 reinforced this guidance.
+E2E history:
+- E2E #6 v070: DS wrote PATTERNS.md only after a rollback. Per-skill decisions before that point had to be re-touched. v0.7.1 added "PATTERNS.md FIRST" reinforcement.
+- E2E #7 v071: neither DS nor GLM wrote PATTERNS.md, but GLM wrote 6 rich phase-completion logs and a comprehensive AGENT.md — the methodology WAS captured, just in different files. v0.7.2 blesses the broader principle: persist before you advance, format flexible.
 
-The engine's filesystem-derived milestones (Group A v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract.
+The engine's filesystem-derived milestones (Group A v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract; the persistence file is your project's memory.
@@ -222,6 +222,24 @@ Regulations are often ambiguous. When you encounter ambiguity:
 
 Do not skip ambiguous rules. They are often the most important ones.
 
+## Sanity-check applicability against the sample corpus
+
+After extracting your rule catalog and before authoring skills, do this 5-minute check: project each rule's applicability filter against the sample corpus.
+
+The check:
+1. Walk `samples/` and classify each sample by product type / report type / document format
+2. For each rule, count how many samples it would apply to (per the rule's `applicability` field, scope filter, or whatever shape your catalog uses)
+3. Flag rules that apply to **0 samples** — they're either genuinely test-corpus-irrelevant (acceptable) or over-constrained (a bug)
+
+E2E #7 GLM produced a 97-rule catalog where 36 rules (37%) had `PASS=0 FAIL=0 NOT_APPLICABLE=90` across all 90 documents — they never fired. Some were legit (rules for cash-management products with no cash-management samples in the corpus), but 36 inactive rules out of 97 was high enough to suggest scope-too-narrow drift.
+
+If many rules are 0-sample:
+- **Reframe their applicability** — broaden product types, look for evidence in headers/footers not just the body, relax the scope filter
+- **Document them as "future scope"** and remove them from this iteration's catalog (still capture them in a `rules/future_scope.md` so they're not forgotten)
+- **Update the test corpus** to include matching samples (work with the developer user)
+
+Catching this in `rule_extraction` is much cheaper than authoring 36 skills that then test as inactive in `skill_testing`. The cheap projection here is worth the time it saves later.
+
 ## When Rules Change
 
 Regulations evolve. When the developer user adds new or updated regulation documents:
@@ -45,6 +45,32 @@ If yes, design a worker LLM prompt. Use the smallest model tier that maintains a
 ### The hybrid approach (most common)
 Most rules are a mix: regex extracts the number, Python compares it to the threshold, LLM handles the exceptional cases. Design the workflow as a pipeline where cheap steps run first and expensive steps run only when needed.
 
+### When regex alone isn't enough — decision rubric
+
+Before declaring distillation complete, audit each rule's `verification_type` / `metric` / `evidence_type` (or the equivalent fields in your catalog). For rules where the required verification is one of:
+
+- **Semantic** ("is this a positive guarantee or a disclaimer?")
+- **Contextual** ("interpret this in light of the document's product type")
+- **Counterfactual** ("what should this value be, given the other fields?")
+- **Cross-field arithmetic** ("does 期初 + 收益 - 分配 = 期末?")
+
+regex alone rarely suffices. Three acceptable forms:
+
+1. **Pure regex with documented limits** — write the regex check, include a comment explaining the fragility (e.g., "matches syntactic pattern only; cannot detect semantic guarantees")
+2. **Hybrid regex + LLM** — regex baseline catches obvious cases, `worker_llm_call` (tier1-2) handles ambiguous ones. The hybrid workflow declares which rule_ids escalate.
+3. **Pure LLM via `worker_llm_call`** — for fully semantic rules where no regex baseline is meaningful.
+
+Don't ship pure regex for a rule whose `verification_type` is `judgment` / `semantic` without the documented-limits note. Future-you or a colleague will assume the regex is sufficient, and that bug will hide for months.
+
+### Worker LLM cost-aware tier choice
+
+If you do escalate to LLM:
+- **tier1** (most capable, ~¥0.001-0.002/doc): cross-field reasoning, ambiguity resolution, rules that benefit from chain-of-thought
+- **tier2-3**: bulk extraction with simple semantic checks
+- **tier4** (cheapest): high-volume keyword-spotting that regex can't handle. Note: tier4 models on SiliconFlow are Qwen3.5 thinking-mode — `content` can return empty if `reasoning_content` consumes max_tokens. Test with realistic prompts before relying on them. If you see empty responses, bump max_tokens to ≥8192, shorten your prompt, or fall back to tier1-2.
+
+Both v0.7.1 audit conductors (DS and GLM) defaulted to all-regex distillation and only added LLM escalation when the human user explicitly asked for "V2 with worker LLM". If your rule catalog has any rules where the verification is genuinely semantic, reach for `worker_llm_call` yourself — don't wait to be asked.
+
 ## Workflow Structure
 
 A workflow is a Python file (or small set of files) in `workflows/`:
@@ -144,6 +144,39 @@ E2E #6 v070 exposed this anti-pattern (DS wrote every bundled skill's check.py
 as `{"pass": null, "method": "stub"}`, deferring to workflows/).
 v0.7.1 wrote this anti-pattern into the skill explicitly.
 
+E2E #7 v071 showed the anti-stub guidance held on both conductors (neither
+run contained the `{"pass": null}` stub pattern), but **DS still inverted
+the canonical-vs-distilled relationship**: DS wrote 6 thematically grouped
+skill folders, each with only a SKILL.md (no check.py), while the real
+verification code lived in `workflows/<skill>/check.py`. No stubs is good;
+the inversion is not: changing one rule's logic means touching both
+SKILL.md (the doc) and the workflow check.py (the code), and the single
+source of truth is lost.
+
+GLM v071, by contrast, landed the canonical pattern: 97/97 skills had both
+SKILL.md and a real `check.py` (regex + applicability logic, median 143
+lines), while `workflows/<id>/workflow_v1.py` was a 50-line thin shell that
+just imported and called the skill's check.py:
+
+```python
+# workflows/D01-01/workflow_v1.py — thin shell, 52 lines
+import importlib.util, json
+from pathlib import Path
+
+def run(doc_text: str, meta: dict = None) -> dict:
+    check_path = Path(__file__).parent.parent.parent / "rule_skills" / "D01-01" / "check.py"
+    spec = importlib.util.spec_from_file_location("check", check_path)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    result = mod.check(doc_text, meta)
+    result["_workflow"] = "D01-01_v1"
+    return result
+```
+
+This is the v0.7.2+ canonical pattern: the workflow is a shell pointing at
+the skill's check.py. To iterate on a rule's verification logic, edit
+`rule_skills/<id>/check.py`; the workflow stays untouched. v0.7.2 states
+the guidance more plainly: no stubs, AND keep the canonical relationship
+(the skill is canonical, the workflow is the distilled thin shell).
+
 ### Naming convention for merged checks
 
 When you do need to merge, spell out the range in the file name:
@@ -304,18 +337,26 @@ Keep the whole of PATTERNS.md under about 5 KB. When it grows past that, cut the least actionable
 5. **Pick the first task.** Work it to completion (skill + check + at least one local test). Write what you learned into PATTERNS.md. Move to the next task.
 6. **At task ~5 and task ~10:** stop and re-read PATTERNS.md. If the accumulated patterns suggest refactoring earlier work, **do it now** (cheap) rather than later (expensive).
 
-### Why PATTERNS.md comes first, before any skill code
+### Persist the methodology — PATTERNS.md, phase logs, or AGENT.md decisions
+
+The principle: before each phase advance, write framework-level decisions to disk. The conversation will be compacted, the agent will restart, and the next phase will lose its context. Whichever format you pick, **write to disk** — don't depend on conversation context that disappears.
+
+Three formats, all defensible; pick one and stick with it:
+
+- **`rules/PATTERNS.md`** — concise, framework-level content only, updated as the project progresses. Suits greenfield projects with a clear up-front hypothesis and structure. Cap it at ~5 KB; entries are transferable shapes / project-level constraints / anti-patterns with the reasons attached (see the "what to write" section above).
 
-If you start writing skill code (rule_skills/<id>/check.py) before PATTERNS.md exists, **stop**. Even a 200-byte initial PATTERNS.md ("decided on Shannon-Huffman; the first hard rule R028 dictates the verdict shape; sample corpus has bilingual Chinese/English table headers") sets up the framework. Each later rule skips one re-derivation of the same shapes, roughly a 4x overall time saving.
+- **`logs/phase_<name>_complete.md` per phase** — incremental; records what each phase produced, which decisions were made, and what the next phase inherits. Suits iterative discover-as-you-go work. E2E #7 GLM used this pattern: 6 phase docs plus an `evolution_summary_v1.2.md`; the methodology was captured all the same, just without a PATTERNS.md.
 
-"I'll finish the skills first and write PATTERNS.md once I have insights."
+- **`AGENT.md` decisions section + domain notes** — narrative style, a living document of "what we know" and "why". Suits projects with rich domain context to capture (regulations, edge cases, thresholds, sample format distributions). E2E #7 GLM's AGENT.md held regulation effective dates, product type taxonomies, threshold values, and sample format counts — entirely fine; it's a different idiom for the same goal.
 
-By the time you finish N skills, you've already made N implicit decisions (verdict shape, chunker boundaries, worker tier) — each rule derived from scratch. Refactoring means touching N files instead of one.
+What not to do: skip persistence and live on conversation context alone. By the time you reach skill N with no methodology on disk, you've made N implicit decisions about verdict shape, chunker boundaries, and worker tier — each rule derived from scratch, and refactoring touches N files instead of one.
 
-"Write PATTERNS.md first (even a rough draft), re-read it before each new rule, and update it whenever a discovery changes the framework."
+"I'll get around to recording these insights when I have time."
 
-PATTERNS.md is the project's index card. Build it before the work, update it during the work, harvest from it after.
+"Before each phase advance, write what this phase taught me into whichever persistence file fits this project's idiom — even if it's only a draft."
 
-E2E #6 v070 exposed this: DS wrote PATTERNS.md only after the user stepped in and rolled back, and the per-skill design decisions made before that point had each already solidified and had to be touched again. v0.7.1 made this guidance more explicit.
+E2E history:
+- E2E #6 v070: DS wrote PATTERNS.md only after the user stepped in and rolled back. The per-skill design decisions before that point had each solidified and had to be touched again. v0.7.1 added the "PATTERNS.md FIRST" guidance.
+- E2E #7 v071: neither DS nor GLM wrote PATTERNS.md, but GLM wrote 6 phase-completion logs and a detailed AGENT.md — the methodology *was* captured, just in different files. v0.7.2 writes the broader principle into the skill: persist before advancing; the format is flexible.
 
-The engine derives milestones from the filesystem (v0.7.0 Group A) and verifies coverage against what is on disk, however you slice the work. The TaskBoard is your scratchpad; the disk is the contract.
+The engine derives milestones from the filesystem (v0.7.0 Group A) and verifies coverage against what is on disk, however you slice the work. The TaskBoard is your scratchpad; the disk is the contract; the persistence file is the project's memory.