npm - kc-beta - Versions diffs - 0.7.0 → 0.7.1 - Mend

kc-beta 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/package.json +1 -1
package/src/agent/engine.js +15 -1
package/src/agent/pipelines/_milestone-derive.js +58 -3
package/src/agent/pipelines/extraction.js +27 -0
package/src/agent/pipelines/skill-testing.js +39 -0
package/src/agent/tools/phase-advance.js +20 -3
package/template/skills/en/meta-meta/work-decomposition/SKILL.md +60 -0
package/template/skills/zh/meta-meta/work-decomposition/SKILL.md +57 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "kc-beta",
-  "version": "0.7.0",
+  "version": "0.7.1",
   "description": "KC Agent — LLM document verification agent (pure Node.js CLI). Dual-licensed: PolyForm Noncommercial 1.0.0 for personal/noncommercial use; commercial license required for enterprise production. See LICENSE and LICENSE-COMMERCIAL.md.",
   "type": "module",
   "bin": {

package/src/agent/engine.js CHANGED Viewed

@@ -423,7 +423,21 @@ export class AgentEngine {
         new ScheduleFetchTool(this.workspace),
         new ReleaseTool(this.workspace, { kcVersion: "0.5.2" }),
         new PhaseAdvanceTool(
-          (to, reason, opts) => this._advancePhase(to, reason, opts),
+          // v0.7.1 2c: advanceFn returns rich `{advanced, engineCounts?}`
+          // so the tool's refusal text can surface the engine telemetry
+          // that motivated the refusal. Internal callers of
+          // `_advancePhase` continue to use the bool return value
+          // directly; only this lambda wraps for the LLM-facing tool.
+          (to, reason, opts) => {
+            const advanced = this._advancePhase(to, reason, opts);
+            if (!advanced) {
+              let engineCounts = null;
+              try { engineCounts = this._buildEngineCountsBlock(this.currentPhase); }
+              catch { /* defensive */ }
+              return { advanced: false, engineCounts };
+            }
+            return { advanced: true };
+          },
           () => this.currentPhase, // H1: tool reads phase BEFORE its own call
           // v0.6.2 J1: surface running subagents so the tool can refuse
           // advance until the agent explicitly acknowledges them.

package/src/agent/pipelines/_milestone-derive.js CHANGED Viewed

@@ -250,7 +250,8 @@ export function deriveSkillAuthoringMilestones(workspace) {
 export function deriveSkillTestingMilestones(workspace) {
   const cwd = cwdOf(workspace);
   const skillsDir = path.join(cwd, "rule_skills");
-  const skillsTested = [];
+  // Use a Set so the v0.7.1 1a output/-side scan can add without duplicates.
+  const tested = new Set();
   if (dirExists(skillsDir)) {
     for (const e of listChildDirs(skillsDir)) {
@@ -266,14 +267,68 @@ export function deriveSkillTestingMilestones(workspace) {
         fileExists(path.join(skillPath, "assets", "test_cases.json")) ||
         listChildFiles(skillPath).some((f) =>
           /^(test|.*_test)_(output|result|log)/i.test(f.name) && f.name.endsWith(".json"));
-      if (hasTestArtifact) skillsTested.push(e.name);
+      if (hasTestArtifact) tested.add(e.name);
     }
   }
+  // v0.7.1 1a: also credit rules whose verdicts appear in output/*.json.
+  // Agents naturally write batch-test results to output/, not per-skill
+  // paths. v0.6.x's _loadTestResults already reads here on the canonical
+  // accuracy schema; this expands the helper-derived milestone to
+  // recognize the same shape (plus the GLM/DS-shape variants seen in
+  // E2E #6 v070). Without this, agents who run tests via sandbox_exec
+  // and persist to output/ saw skillsTested=0 and force-bypassed.
+  const collectFromJsonFile = (data) => {
+    if (!data) return;
+    if (data.rule_id) tested.add(data.rule_id);
+    if (Array.isArray(data) && data[0] && typeof data[0] === "object" && data[0].rule_id) {
+      for (const r of data) if (r?.rule_id) tested.add(r.rule_id);
+    }
+    if (data.results && typeof data.results === "object") {
+      for (const k of Object.keys(data.results)) tested.add(k);
+    }
+  };
+  const outputDir = path.join(cwd, "output");
+  if (dirExists(outputDir)) {
+    for (const f of listChildFiles(outputDir)) {
+      if (!f.name.endsWith(".json")) continue;
+      collectFromJsonFile(readJsonSafe(path.join(outputDir, f.name)));
+    }
+    // One level into output/results/ and output/distillation/ — the two
+    // most common batch-result locations across E2E #5 and v070 sessions — plus output/qc/.
+    for (const sub of ["results", "distillation", "qc"]) {
+      const subDir = path.join(outputDir, sub);
+      if (!dirExists(subDir)) continue;
+      for (const f of listChildFiles(subDir)) {
+        if (!f.name.endsWith(".json")) continue;
+        collectFromJsonFile(readJsonSafe(path.join(subDir, f.name)));
+      }
+      // GLM v070 wrote per-rule subdirs under output/results/<rule_id>/
+      // — walk one more level for that pattern.
+      for (const child of listChildDirs(subDir)) {
+        for (const f of listChildFiles(path.join(subDir, child.name))) {
+          if (!f.name.endsWith(".json")) continue;
+          collectFromJsonFile(readJsonSafe(path.join(subDir, child.name, f.name)));
+        }
+      }
+    }
+  }
+  // DS v070 wrote a top-level aggregate at either rules/test_results.json
+  // OR rule_skills/test_results.json. Both seen in the wild; check both, plus <cwd>/test_results.json.
+  for (const candidate of [
+    path.join(cwd, "rules", "test_results.json"),
+    path.join(cwd, "rule_skills", "test_results.json"),
+    path.join(cwd, "test_results.json"),
+  ]) {
+    if (fileExists(candidate)) collectFromJsonFile(readJsonSafe(candidate));
+  }
   // skillsPassing — per-skill accuracy threshold. Without a uniform
   // schema across agent outputs we report `tested` as the floor; the
   // pipeline's existing _loadTestResults() can layer accuracy on top.
-  return { skillsTested };
+  return { skillsTested: [...tested] };
 }
 // ───────────────────────────────────────────────────────────────────

package/src/agent/pipelines/extraction.js CHANGED Viewed

@@ -205,6 +205,33 @@ export class RuleExtractionPipeline extends Pipeline {
       return `workflow_run is SKILL_TESTING/PRODUCTION_QC-phase work, but engine is in RULE_EXTRACTION. ${exitText}`;
     }
+    // v0.7.1 2a/2b: when agent attempts phase_advance from rule_extraction,
+    // surface advisories for the two soft-but-load-bearing artifacts the
+    // gate criteria require (chunk_refs and coverage_audit). v0.7.0 GLM
+    // session forced through with both missing — gate refused for the
+    // right reason but the refusal text was generic. Name them inline.
+    if (toolName === "phase_advance" && toolInput?.to === "skill_authoring") {
+      const advisories = [];
+      if (this.rulesExtracted.length > 0 && this.rulesWithChunkRefs.length === 0) {
+        advisories.push(
+          `Advancing rule_extraction with rulesWithChunkRefs=0/${this.rulesExtracted.length}. ` +
+          `The skill_authoring phase's prompts use source_chunk_ids to ground ` +
+          `skill explanations against regulation text. Without them, skill authoring ` +
+          `runs blind. Either populate chunk refs via the rule_catalog tool, or ` +
+          `accept that skill_authoring's generated content won't cite source regulation.`,
+        );
+      }
+      if (this.rulesExtracted.length > 0 && !this.coverageAudited) {
+        advisories.push(
+          `Advancing rule_extraction without rules/coverage_audit.md (or .json). ` +
+          `Coverage audit identifies regulation articles you didn't extract a rule ` +
+          `for — without it, gaps go silent through to production. If your ` +
+          `extraction is genuinely complete, write a one-paragraph audit confirming so.`,
+        );
+      }
+      if (advisories.length > 0) return advisories.join("\n\n");
+    }
     return null;
   }

package/src/agent/pipelines/skill-testing.js CHANGED Viewed

@@ -14,6 +14,11 @@ export class SkillTestingPipeline extends Pipeline {
     this.iterationCount = 0;
     this._accuracyThreshold = 0.9;
     this._maxIterations = 20;
+    // v0.7.1 1b: rate-limit phaseMisfitHint firing for ephemeral
+    // sandbox tests. Caps at ~3 nudges per phase entry so the agent
+    // sees the path expectation but doesn't get spammed during a
+    // batch run.
+    this._misfit_nudge_count = 0;
     this._scanWorkspace();
   }
@@ -132,6 +137,12 @@ export class SkillTestingPipeline extends Pipeline {
    * v0.6.3 (#74): SKILL_TESTING runs check scripts against test samples and
    * measures accuracy. Writing distillation outputs or production results
    * here means phase boundaries got skipped.
+   *
+   * v0.7.1 1b: also nudges agents who run check scripts via sandbox_exec
+   * but don't persist verdicts. E2E #6 v070 surfaced this — both
+   * conductors batched tests in one sandbox_exec call, read pass/fail
+   * from stdout, then declared "testing done" while engine saw
+   * skillsTested=0 because nothing landed in a recognized path.
    */
   phaseMisfitHint(toolName, toolInput, result) {
     if (result?.isError) return null;
@@ -148,6 +159,34 @@ export class SkillTestingPipeline extends Pipeline {
         return `Writing under output/results/ is PRODUCTION_QC-phase work, but engine is in SKILL_TESTING. ${exitText}`;
       }
     }
+    // v0.7.1 1b: sandbox_exec test-command nudge
+    if (toolName === "sandbox_exec") {
+      const cmd = String(toolInput?.command || "");
+      const looksLikeTest =
+        /python.*check.*\.py.*\.(txt|pdf|md|docx)/i.test(cmd) ||
+        /pytest|unittest|run_tests/i.test(cmd) ||
+        /python.*workflow.*\.py.*samples/i.test(cmd);
+      if (!looksLikeTest) return null;
+      const tested = Object.keys(this.skillsTested).length;
+      const total = this.skillsToTest.length;
+      // Already satisfied? Don't nudge.
+      if (total === 0 || tested >= total) return null;
+      // Rate-limit: ~3 per phase. Counter resets on phase entry
+      // (constructor) and on importState if available.
+      this._misfit_nudge_count = (this._misfit_nudge_count || 0) + 1;
+      if (this._misfit_nudge_count > 3) return null;
+      return (
+        `Engine derives skillsTested from rule_skills/<id>/test_results.json, ` +
+        `rule_skills/<id>/tests/, OR output/*.json with rule_id field. ` +
+        `Sandbox runs are ephemeral — record per-rule verdicts to one of ` +
+        `those paths before phase_advance. Currently engine sees ` +
+        `${tested}/${total} skills tested.`
+      );
+    }
     return null;
   }

package/src/agent/tools/phase-advance.js CHANGED Viewed

@@ -15,7 +15,11 @@ const VALID_PHASES = new Set(Object.values(Phase));
  */
 export class PhaseAdvanceTool extends BaseTool {
   /**
-   * @param {(to: string, reason: string, opts: {force?: boolean}) => boolean} advanceFn
+   * @param {(to: string, reason: string, opts: {force?: boolean}) => {advanced: boolean, engineCounts?: string}} advanceFn
+   *   v0.7.1 2c: returns the rich object so the tool can surface engine
+   *   telemetry in the refusal text. Internal engine callers of
+   *   `_advancePhase` still get the bool; only this LLM-facing tool
+   *   uses the wrapped form.
    * @param {() => string} getCurrentPhaseFn - H1: lets the tool read the
    *   engine's phase BEFORE the call, so it can distinguish "already there"
    *   (silent no-op, informational) from "non-adjacent refusal" (actionable).
@@ -91,7 +95,11 @@ export class PhaseAdvanceTool extends BaseTool {
       );
     }
-    const advanced = this._advance(to, input.reason || "agent request", { force: !!input.force });
+    // v0.7.1 2c: advanceFn returns {advanced, engineCounts?} so we can
+    // surface telemetry in the refusal text below. Internal callers of
+    // _advancePhase still get bool; only this LLM-facing tool unwraps.
+    const advanceResult = this._advance(to, input.reason || "agent request", { force: !!input.force });
+    const advanced = !!advanceResult?.advanced;
     if (advanced) {
       // Log the ack so post-mortems can find phase advances that proceeded
       // with live subagents
@@ -113,9 +121,18 @@ export class PhaseAdvanceTool extends BaseTool {
     // immediately (12/12 transitions). The escape valve remains in the input
     // schema (discoverable) but isn't hand-fed to the LLM here. Instead,
     // direct the agent at the missing milestones it can satisfy.
+    //
+    // v0.7.1 2c: include engineCounts when available so the agent sees
+    // exactly which milestones the gate is reading and can satisfy them.
+    // E2E #6 v070 showed the generic "check /status" hint wasn't concrete
+    // enough — agents forced through. Naming the gap inline reduces that.
+    const engineCountsLine = advanceResult?.engineCounts
+      ? `\nEngine telemetry: ${advanceResult.engineCounts}`
+      : "";
     return new ToolResult(
       `Did not advance to ${to} (currently in ${beforePhase || "?"}). ` +
-      `Likely cause: source-phase exit criteria not met. ` +
+      `Likely cause: source-phase exit criteria not met.${engineCountsLine}\n\n` +
       `Run /status (or read the phase describeState block in this turn's system reminder) ` +
       `to see which milestones are missing, then produce the disk artifacts that satisfy them — ` +
       `the engine derives milestones from filesystem facts (rule_skills/<id>/SKILL.md, check.py, ` +

package/template/skills/en/meta-meta/work-decomposition/SKILL.md CHANGED Viewed

@@ -101,6 +101,52 @@ The v0.6.2 D2 anti-pattern wording captures the failure case clearly:
 That came from E2E #4 where one conductor wrote a 2,400-line `unified_qc.py` that ran all rules at once. It produced 1,150 ERROR verdicts (16.6%) because every rule's failure cascaded into every other rule's verdict. Per-rule skills are KC's unit of granularity for a reason.
+### Anti-pattern: stub check.py + real workflow.py
+Do NOT make `rule_skills/<id>/check.py` a stub that defers to
+`workflows/<id>/workflow.py`. KC's intent: SKILL.md + check.py is the
+**canonical** verification. workflow.py is the **distilled, cheaper**
+form (regex baseline + LLM fallback). The relationship is
+skill → workflow, not workflow → skill.
+❌ DON'T:
+```python
+# rule_skills/R001/check.py — STUB, real logic elsewhere
+def check(text):
+    rule_ids = re.findall(r"R\d{3}", load_skill())
+    return {rid: {"pass": None, "method": "stub",
+                  "note": "to be implemented later"} for rid in rule_ids}
+# real verification logic only in workflows/R001/workflow_v1.py
+```
+✅ DO:
+```python
+# rule_skills/R001/check.py — canonical verification
+def check(text):
+    matches = re.findall(r"...", text)  # actual rule logic
+    return {"rule_id": "R001", "passed": bool(matches),
+            "evidence": matches[:3], "method": "regex"}
+# workflows/R001/workflow_v1.py — distilled, cheaper form
+def run(text, llm_fn=None):
+    result = check(text)             # baseline from skill
+    if not result["passed"] and llm_fn:
+        result = llm_verify(text, llm_fn)  # escalate on fail
+    return result
+```
+Why it matters: distillation phase consumers (release tool, run.py
+harness) load workflow.py. If check.py is a stub, the skill's
+methodology (SKILL.md) becomes documentation-only and the
+verification logic is scattered across N workflow files. Future
+iterations of the skill (changes to regulation interpretation, edge
+cases discovered in production) need a single canonical place to
+update — the skill — not N workflows that have drifted independently.
+E2E #6 v070 surfaced this pattern (DS bundled-skill check.py files
+all returned `{"pass": null, "method": "stub"}` deferring to
+workflows/). v0.7.1 added this anti-pattern explicitly.
 ### Naming convention for grouped checks
 When you do bundle, name the file with the explicit range:
@@ -263,4 +309,18 @@ When entering skill_authoring with an empty TaskBoard:
 5. **Pick the first task.** Work it to completion (skill + check + at least one local test). Update PATTERNS.md with whatever you learned. Move to the next task.
 6. **At task ~5 and task ~10:** stop and re-read PATTERNS.md. If patterns suggest a refactor of earlier work, do it now (cheap) rather than later (expensive).
+### Why PATTERNS.md FIRST, before any skill code
+If you start writing skill code (rule_skills/<id>/check.py) before PATTERNS.md exists, **stop**. Even a 200-byte initial PATTERNS.md ("decided Shannon-Huffman; first hard rule R028 will dictate verdict shape; sample corpus has bilingual table headings") sets the framework. You'll save 4× the time later by not re-deriving the same shapes for each rule.
+❌ "I'll write the skills first, then PATTERNS.md when I have insights."
+By the time you have N skills, you've made N implicit decisions about verdict shape, chunker boundaries, worker tier — each rule re-derives from scratch. Refactoring requires touching N files instead of one.
+✅ "Write PATTERNS.md, even tentatively, then re-read it before each new rule. Update it when discoveries change the framework."
+PATTERNS.md is your project's index card. Build it before the work, update it during the work, harvest it after.
+E2E #6 v070 surfaced this: DS only wrote PATTERNS.md after a rollback intervention; the per-skill design decisions before that point were already locked in and had to be re-touched. v0.7.1 reinforced this guidance.
 The engine's filesystem-derived milestones (Group A v0.7.0) verify coverage on disk regardless of how you split the work. The TaskBoard is your scratchpad; the disk is the contract.

package/template/skills/zh/meta-meta/work-decomposition/SKILL.md CHANGED Viewed

@@ -101,6 +101,49 @@ v0.6.2 D2 的反模式说法已经把失败情形说得很清楚了：
 那段话来自 E2E #4：一个指挥模型写了 2,400 行 `unified_qc.py` 一次性跑所有规则。结果出现 1,150 条 ERROR verdict（16.6%），因为每条规则的失败都连带把所有其他规则的判定也带崩了。Per-rule skill 是 KC 的粒度单元，这是有原因的。
+### 反模式：check.py 是 stub + workflow.py 才是真逻辑
+**不要**把 `rule_skills/<id>/check.py` 写成一个把真实逻辑推迟到
+`workflows/<id>/workflow.py` 的占位文件。KC 的设计意图是：SKILL.md
++ check.py 是**正典**核查；workflow.py 是**蒸馏后、更便宜**的形式
+（regex 优先 + LLM 回退）。关系是 skill → workflow，不是反过来。
+❌ 不要这样：
+```python
+# rule_skills/R001/check.py —— STUB，真逻辑在别处
+def check(text):
+    rule_ids = re.findall(r"R\d{3}", load_skill())
+    return {rid: {"pass": None, "method": "stub",
+                  "note": "待技能测试阶段实现"} for rid in rule_ids}
+# 实际核查逻辑只在 workflows/R001/workflow_v1.py 里
+```
+✅ 应该这样：
+```python
+# rule_skills/R001/check.py —— 正典核查
+def check(text):
+    matches = re.findall(r"...", text)  # 真实规则逻辑
+    return {"rule_id": "R001", "passed": bool(matches),
+            "evidence": matches[:3], "method": "regex"}
+# workflows/R001/workflow_v1.py —— 蒸馏后的便宜形式
+def run(text, llm_fn=None):
+    result = check(text)             # skill 提供基线
+    if not result["passed"] and llm_fn:
+        result = llm_verify(text, llm_fn)  # FAIL 时升级到 LLM
+    return result
+```
+为什么重要：蒸馏阶段下游消费者（release 工具、run.py 运行器）加载
+的是 workflow.py。如果 check.py 是 stub，skill 的方法论（SKILL.md）
+就只剩文档作用，而核查逻辑被分散到 N 个 workflow 文件里。后续对
+skill 的迭代（法规解释变化、生产中发现的边缘情形）需要一个**正典
+位置**来更新——也就是 skill——而不是 N 个已经各自漂移的 workflow。
+E2E #6 v070 暴露了这个反模式（DS 把所有 bundled skill 的 check.py
+都写成 `{"pass": null, "method": "stub"}` 推给 workflows/）。
+v0.7.1 把这个反模式显式写进 skill。
 ### 合并 check 的命名约定
 确实需要合并时，文件名要把范围写明：
@@ -261,4 +304,18 @@ PATTERNS.md 全文控制在约 5 KB 之内。超过时，剪掉最不可执行
 5. **挑第一个任务**。做到完整（skill + check + 至少一次本地测试）。把学到的写进 PATTERNS.md。换下一个任务。
 6. **任务做到第 5 个、第 10 个时**：停下来重读 PATTERNS.md。如果新积累的 pattern 暗示要重构早期工作，**现在做**（便宜）而不是更晚（昂贵）。
+### 为什么 PATTERNS.md 要先写、写在 skill 代码之前
+如果你在 PATTERNS.md 还不存在的时候就开始写 skill 代码（rule_skills/<id>/check.py），**停**。哪怕只是 200 字节的初始 PATTERNS.md（"决定走 Shannon-Huffman；第一条难规则 R028 决定 verdict 形状；样本语料表头中英双语"）也能搭起框架。后续每条规则少重新推导一次同样的形状，整体能省 4 倍时间。
+❌ "我先把 skill 写完，等有洞察再写 PATTERNS.md。"
+到你写完 N 个 skill 时，你已经做了 N 个隐式决定（verdict 形状、chunker 边界、worker tier）——每条规则都是从零推导。重构需要碰 N 个文件，而不是一个。
+✅ "先写 PATTERNS.md（哪怕是初步的），写每条新规则之前先重读，发现新东西就回头更新。"
+PATTERNS.md 是项目的索引卡片。工作之前搭好它、工作中更新它、工作之后从中收割。
+E2E #6 v070 暴露了这个：DS 在用户介入回退之后才写 PATTERNS.md，而那之前每条 skill 的设计决定都已经各自固化、之后还要再碰一遍。v0.7.1 把这个引导写得更明确。
 引擎从文件系统推导里程碑（v0.7.0 Group A）会按磁盘事实核验覆盖率，无论你怎么切分工作。TaskBoard 是你的草稿；磁盘才是契约。