kc-beta 0.5.6 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,20 @@
1
1
  import { withRetry } from "./retry.js";
2
2
 
3
+ // A5: SSE accumulator safety cap. If a provider ever sends an abnormally
4
+ // large `data: ...` line without a newline terminator, the parser's
5
+ // `buffer += decoder.decode(chunk)` + `buffer.split("\n")` would grow
6
+ // unbounded and trigger O(n²) splitting once it gets into the hundreds
7
+ // of MB. 8 MB is multiple orders of magnitude above any legitimate single
8
+ // SSE frame (largest seen in the wild: ~80 KB for multi-tool-call deltas).
9
// The SSE parsers compare this cap against `buffer.length`, i.e. the number
// of UTF-16 code units of decoded text, so it is an approximate byte budget.
const SSE_BUFFER_CAP_BYTES = 8 * 1024 * 1024;

/**
 * Thrown by the SSE parsers when the pending text buffer grows past
 * SSE_BUFFER_CAP_BYTES.
 *
 * Callers can identify it either by `code === "SSE_BUFFER_OVERFLOW"` or by
 * `name === "SseOverflowError"`; the observed buffer size is kept on `bytes`.
 */
class SseOverflowError extends Error {
  /**
   * @param {number} bytes - Size of the buffer at the moment the cap was exceeded.
   */
  constructor(bytes) {
    super(`SSE buffer overflow (${bytes} bytes without newline) — aborting stream`);
    // Without an explicit name the error prints as a generic "Error".
    this.name = "SseOverflowError";
    this.code = "SSE_BUFFER_OVERFLOW";
    this.bytes = bytes;
  }
}
17
+
3
18
  /**
4
19
  * Multi-protocol LLM client using native fetch + SSE parsing.
5
20
  * Supports OpenAI-compatible APIs and Anthropic Messages API.
@@ -144,26 +159,56 @@ export class LLMClient {
144
159
  async *streamChat({ model, messages, tools, maxTokens }) {
145
160
  const body = this._buildStreamBody({ model, messages, tools, maxTokens });
146
161
 
147
- const resp = await withRetry(async () => {
148
- const r = await fetch(this._getEndpoint(), {
149
- method: "POST",
150
- headers: this._buildHeaders(),
151
- body: JSON.stringify(body),
162
+ let resp;
163
+ try {
164
+ resp = await withRetry(async () => {
165
+ const r = await fetch(this._getEndpoint(), {
166
+ method: "POST",
167
+ headers: this._buildHeaders(),
168
+ body: JSON.stringify(body),
169
+ });
170
+ if (!r.ok) {
171
+ const text = await r.text();
172
+ const err = new Error(`LLM API error ${r.status}: ${text}`);
173
+ err.status = r.status;
174
+ err.retryAfter = r.headers.get("retry-after");
175
+ err.streamTermination = "http_error";
176
+ throw err;
177
+ }
178
+ return r;
152
179
  });
153
- if (!r.ok) {
154
- const text = await r.text();
155
- const err = new Error(`LLM API error ${r.status}: ${text}`);
156
- err.status = r.status;
157
- err.retryAfter = r.headers.get("retry-after");
158
- throw err;
159
- }
160
- return r;
161
- });
180
+ } catch (err) {
181
+ // A8: Any pre-stream failure (network, auth, 4xx/5xx after retry) is
182
+ // tagged and re-thrown. Engine's outer catch sees exactly one tagged
183
+ // error event.
184
+ if (!err.streamTermination) err.streamTermination = "connect_error";
185
+ throw err;
186
+ }
162
187
 
163
- if (this.apiFormat === "anthropic") {
164
- yield* this._parseAnthropicSSE(resp.body);
165
- } else {
166
- yield* this._parseOpenaiSSE(resp.body);
188
+ // A8: Wrap the SSE consumption so ALL termination paths — clean EOS,
189
+ // mid-token abort, SSE overflow, provider disconnect — surface as a
190
+ // single tagged error the engine can report consistently. The inner
191
+ // parsers throw for overflow (A5) and return silently on clean EOS;
192
+ // mid-stream socket errors (undici "terminated") raise here.
193
+ try {
194
+ if (this.apiFormat === "anthropic") {
195
+ yield* this._parseAnthropicSSE(resp.body);
196
+ } else {
197
+ yield* this._parseOpenaiSSE(resp.body);
198
+ }
199
+ } catch (err) {
200
+ if (!err.streamTermination) {
201
+ if (err.code === "SSE_BUFFER_OVERFLOW") err.streamTermination = "sse_overflow";
202
+ else if (err.name === "AbortError") err.streamTermination = "aborted";
203
+ else if (/terminated|reset|ECONNRESET|UND_ERR_ABORTED/i.test(err.message || err.code || ""))
204
+ err.streamTermination = "stream_terminated";
205
+ else err.streamTermination = "stream_error";
206
+ }
207
+ throw err;
208
+ } finally {
209
+ // Best-effort: cancel the body so the underlying socket returns to the
210
+ // connection pool even if the consumer bailed mid-stream.
211
+ try { await resp.body?.cancel?.(); } catch { /* ignore */ }
167
212
  }
168
213
  }
169
214
 
@@ -261,6 +306,8 @@ export class LLMClient {
261
306
 
262
307
  for await (const chunk of body) {
263
308
  buffer += decoder.decode(chunk, { stream: true });
309
+ // A5: bail out before O(n²) split explodes on pathological input.
310
+ if (buffer.length > SSE_BUFFER_CAP_BYTES) throw new SseOverflowError(buffer.length);
264
311
  const lines = buffer.split("\n");
265
312
  buffer = lines.pop();
266
313
 
@@ -313,6 +360,8 @@ export class LLMClient {
313
360
 
314
361
  for await (const rawChunk of body) {
315
362
  buffer += decoder.decode(rawChunk, { stream: true });
363
+ // A5: cap applies to both SSE parsers.
364
+ if (buffer.length > SSE_BUFFER_CAP_BYTES) throw new SseOverflowError(buffer.length);
316
365
  const lines = buffer.split("\n");
317
366
  buffer = lines.pop();
318
367
 
@@ -0,0 +1,186 @@
1
+ import fs from "node:fs";
2
+ import path from "node:path";
3
+ import { PipelineEvent } from "./index.js";
4
+ import { Pipeline } from "./base.js";
5
+ import { normalizeRuleCatalog } from "../rule-catalog-normalize.js";
6
+
7
/**
 * E1: FINALIZATION — the 7th phase. Runs after PRODUCTION_QC. Goal: turn
 * the working system into a shippable deliverable.
 *
 * This pipeline only observes the workspace (via describeState and the
 * exit criteria); the agent produces the artifacts itself:
 *   1. rule_skills/README.md — inventory + how-to-run section.
 *   2. rule_skills/coverage_report.md — rule-id → skill-file mapping,
 *      including which rules are "not_applicable" per D6 classification.
 *   3. output/final_dashboard.html — snapshot of the final metrics.
 *   4. Canonical rule_skills/<rule_id>/ layout: every catalog rule id has
 *      its own directory or falls inside a range directory (R078_R128).
 *
 * Exit criteria (see exitCriteriaMet): the three deliverable files exist
 * AND the canonical-layout check passes — all four checklist items.
 *
 * No successor phase — this is the terminal state; the "phase_ready"
 * event is emitted with `nextPhase: null`.
 */
export class FinalizationPipeline extends Pipeline {
  /**
   * @param {{cwd: string}} workspace - Workspace whose `cwd` is scanned for artifacts.
   */
  constructor(workspace) {
    super();
    this._workspace = workspace;
    this.readmeWritten = false;
    this.coverageReportWritten = false;
    this.finalDashboardWritten = false;
    this.canonicalLayoutDone = false;
    this._scanWorkspace();
  }

  /** Refresh all four checklist flags from the filesystem. */
  _scanWorkspace() {
    const cwd = this._workspace.cwd;
    this.readmeWritten = fs.existsSync(path.join(cwd, "rule_skills", "README.md"));
    this.coverageReportWritten = fs.existsSync(path.join(cwd, "rule_skills", "coverage_report.md"));
    this.finalDashboardWritten = fs.existsSync(path.join(cwd, "output", "final_dashboard.html"));
    this.canonicalLayoutDone = this._checkCanonicalLayout();
  }

  /**
   * Approximate "canonical layout": every catalog rule that has an id must
   * be matched by a directory directly under rule_skills/.
   *
   * A rule id matches when a directory has exactly that name, or — for ids
   * of the form R<digits> — when a single-rule directory (R014) or a range
   * directory (R078_R128, R078-R128) covers its number. Numeric matching
   * ignores case and zero padding for both kinds of directory.
   *
   * @returns {boolean} false when the catalog or rule_skills/ is missing,
   *   unreadable, or empty, or when any rule id has no matching directory.
   */
  _checkCanonicalLayout() {
    const cwd = this._workspace.cwd;
    const catalogPath = path.join(cwd, "rules", "catalog.json");
    const skillsDir = path.join(cwd, "rule_skills");
    if (!fs.existsSync(catalogPath) || !fs.existsSync(skillsDir)) return false;

    let rules;
    try {
      rules = normalizeRuleCatalog(JSON.parse(fs.readFileSync(catalogPath, "utf-8")));
    } catch { return false; }
    // Guard against a non-array result so a malformed catalog cannot throw
    // out of the constructor.
    if (!Array.isArray(rules) || rules.length === 0) return false;

    let dirNames;
    try {
      dirNames = fs.readdirSync(skillsDir, { withFileTypes: true })
        .filter((e) => e.isDirectory())
        .map((e) => e.name);
    } catch { return false; }

    const exactNames = new Set(dirNames);
    // Numeric spans covered by directories; a single-rule directory is the
    // degenerate span lo === hi.
    const spans = [];
    for (const name of dirNames) {
      const range = name.match(/^R0*(\d+)[_-]R0*(\d+)$/i);
      if (range) {
        spans.push({ lo: parseInt(range[1], 10), hi: parseInt(range[2], 10) });
        continue;
      }
      const single = name.match(/^R0*(\d+)$/i);
      if (single) {
        const n = parseInt(single[1], 10);
        spans.push({ lo: n, hi: n });
      }
    }

    for (const rule of rules) {
      if (!rule?.id) continue; // rules without an id are not checked
      const id = String(rule.id);
      if (exactNames.has(id)) continue;
      const m = id.match(/^R0*(\d+)$/i);
      if (m) {
        const n = parseInt(m[1], 10);
        if (spans.some((s) => s.lo <= n && n <= s.hi)) continue;
      }
      return false;
    }
    return true;
  }

  /**
   * Re-scan the workspace and render the phase prompt, including the
   * four-item checklist with its current status.
   *
   * @returns {string} Markdown for the agent context.
   */
  describeState() {
    this._scanWorkspace();
    const checklist = [
      `- ${this.readmeWritten ? "✅" : "⏳"} rule_skills/README.md`,
      `- ${this.coverageReportWritten ? "✅" : "⏳"} rule_skills/coverage_report.md`,
      `- ${this.finalDashboardWritten ? "✅" : "⏳"} output/final_dashboard.html`,
      `- ${this.canonicalLayoutDone ? "✅" : "⏳"} rule_skills/ canonical per-rule layout`,
    ];
    const parts = [
      "## Phase: FINALIZATION\n" +
      "Turn the working verification system into a shippable deliverable. The " +
      "pipeline has completed end-to-end; now package it for handoff. This is " +
      "the terminal phase — no successor. You can continue producing artifacts " +
      "here on request.\n\n" +
      "**Tasks to complete** (the pipeline considers the phase done when all " +
      "four checkmarks are green):\n\n" +
      checklist.join("\n") + "\n\n" +
      "### What each artifact should contain\n\n" +
      "- **README.md**: top of `rule_skills/` with file inventory, how to run " +
      " `run_all_checks.py` (if present), input format, expected output format, " +
      " dependencies, and a short 'what this does' for a reader who hasn't " +
      " seen the project.\n" +
      "- **coverage_report.md**: one row per rule_id in catalog.json. Columns: " +
      " rule_id, source_ref, skill file (`check_r014.py` or `check_r002_r007.py`), " +
      " tested (Y/N), latest accuracy, retries, applicable-to-this-bundle " +
      " (Y/N from D6 classification). Rules marked not_applicable should be " +
      " grouped at the bottom with a note explaining which bundle-type " +
      " filtered them out.\n" +
      "- **final_dashboard.html**: single-page snapshot. Reuse the " +
      " `dashboard_render` tool — it knows the metrics shape. This is the " +
      " hand-off artifact the developer user opens to see the final state.\n" +
      "- **canonical layout**: the simplest check is `ls rule_skills/ | " +
      " wc -l` ≈ number of rules in the catalog. When grouped files exist " +
      " (`check_r002_r007.py`), create stub `rule_skills/R002/` through " +
      " `rule_skills/R007/` each containing a one-line SKILL.md that points " +
      " at the grouped file. This keeps downstream per-rule lookups simple.",
    ];
    return parts.join("\n\n");
  }

  /**
   * React to a finished tool call: re-scan when the call touched a
   * finalization-relevant path and report the first transition to "done".
   *
   * @param {string} toolName
   * @param {object} toolInput - Its `path` or `command` is inspected.
   * @param {object} result - Tool result; error results are ignored.
   * @returns {PipelineEvent|null} A "phase_ready" event (nextPhase null) the
   *   first time the exit criteria become true, otherwise null.
   */
  onToolResult(toolName, toolInput, result) {
    if (result?.isError) return null;
    const wasReady = this.exitCriteriaMet();
    // Normalise separators so backslash-style paths hit the same checks.
    const touchedPath = String(
      toolInput?.path || toolInput?.command || "",
    ).replace(/\\/g, "/");
    // Re-scan when the agent writes to any relevant path
    if (
      touchedPath.includes("rule_skills/") ||
      touchedPath.includes("output/final_dashboard") ||
      touchedPath.includes("coverage_report")
    ) {
      this._scanWorkspace();
    }
    if (!wasReady && this.exitCriteriaMet()) {
      // Terminal phase — no nextPhase. Pipeline event signals "done."
      return new PipelineEvent({
        type: "phase_ready",
        message: "Finalization artifacts complete. Session deliverable is ready.",
        nextPhase: null,
      });
    }
    return null;
  }

  /** @returns {boolean} true when all four checklist flags are set. */
  exitCriteriaMet() {
    return this.readmeWritten &&
      this.coverageReportWritten &&
      this.finalDashboardWritten &&
      this.canonicalLayoutDone;
  }

  /** @returns {object} Snapshot of the four checklist flags. */
  exportState() {
    return {
      readmeWritten: this.readmeWritten,
      coverageReportWritten: this.coverageReportWritten,
      finalDashboardWritten: this.finalDashboardWritten,
      canonicalLayoutDone: this.canonicalLayoutDone,
    };
  }

  /**
   * Restore flags from a snapshot; only boolean fields are applied.
   * @param {object} data - Object previously produced by exportState.
   */
  importState(data) {
    if (typeof data?.readmeWritten === "boolean") this.readmeWritten = data.readmeWritten;
    if (typeof data?.coverageReportWritten === "boolean") this.coverageReportWritten = data.coverageReportWritten;
    if (typeof data?.finalDashboardWritten === "boolean") this.finalDashboardWritten = data.finalDashboardWritten;
    if (typeof data?.canonicalLayoutDone === "boolean") this.canonicalLayoutDone = data.canonicalLayoutDone;
  }
}
@@ -1,5 +1,12 @@
1
1
  /**
2
2
  * Pipeline phases — sequential workflow of the KC Agent methodology.
3
+ *
4
+ * v0.6.0 E1: FINALIZATION added as the 7th phase. It's a cleanup /
5
+ * deliverable-packaging phase that runs after PRODUCTION_QC has
6
+ * established the system is operating correctly: reorganize
7
+ * rule_skills/ into a canonical layout, write README + coverage
8
+ * report, snapshot a final dashboard, archive stale retry outputs.
9
+ * Short phase, a handful of tasks, driven by a finalization skill.
3
10
  */
4
11
  export const Phase = Object.freeze({
5
12
  BOOTSTRAP: "bootstrap",
@@ -8,6 +15,7 @@ export const Phase = Object.freeze({
8
15
  SKILL_TESTING: "skill_testing",
9
16
  DISTILLATION: "distillation",
10
17
  PRODUCTION_QC: "production_qc",
18
+ FINALIZATION: "finalization",
11
19
  });
12
20
 
13
21
  /**
@@ -138,6 +138,31 @@ export class ProjectInitializer extends Pipeline {
138
138
  this.configReady = !!gc.api_key;
139
139
  }
140
140
 
141
+ /**
142
+ * F1b: Worker LLM health snapshot. Static check only — inspect whether
143
+ * TIER1-4 and OCR_MODEL_TIER1 are populated in .env. Does NOT make
144
+ * network calls — a live ping would be invasive for bootstrap (slow,
145
+ * charges money, and the worker LLM isn't actually used until
146
+ * DISTILLATION). Surfacing the config state is enough for bootstrap.
147
+ * The agent can then decide to validate via worker_llm_call later if
148
+ * warranted. Returns null when no .env exists yet.
149
+ */
150
+ _workerConfigSnapshot() {
151
+ const envPath = path.join(this._workspace.cwd, ".env");
152
+ if (!fs.existsSync(envPath)) return null;
153
+ const tiers = { TIER1: "", TIER2: "", TIER3: "", TIER4: "", OCR_MODEL_TIER1: "" };
154
+ try {
155
+ for (const line of fs.readFileSync(envPath, "utf-8").split("\n")) {
156
+ for (const k of Object.keys(tiers)) {
157
+ if (line.startsWith(`${k}=`)) {
158
+ tiers[k] = line.slice(k.length + 1).trim();
159
+ }
160
+ }
161
+ }
162
+ } catch { return null; }
163
+ return tiers;
164
+ }
165
+
141
166
  _loadGlobalConfig() {
142
167
  const p = path.join(os.homedir(), ".kc_agent", "config.json");
143
168
  if (fs.existsSync(p)) { try { return JSON.parse(fs.readFileSync(p, "utf-8")); } catch { /* skip */ } }
@@ -158,6 +183,21 @@ export class ProjectInitializer extends Pipeline {
158
183
  if (completed.length) parts.push("### Done\n" + completed.map((c) => `- [x] ${c}`).join("\n"));
159
184
  if (pending.length) parts.push("### Needed\n" + pending.map((p) => `- [ ] ${p}`).join("\n"));
160
185
 
186
+ // F1b: surface worker-LLM tier status as part of bootstrap state so
187
+ // the agent can flag missing tiers to the developer user upfront,
188
+ // rather than hitting "worker LLM unreachable" hours later during
189
+ // DISTILLATION. Static inspection only — no network calls.
190
+ const workerConfig = this._workerConfigSnapshot();
191
+ if (workerConfig) {
192
+ const tierLines = [];
193
+ for (const [k, v] of Object.entries(workerConfig)) {
194
+ if (v) tierLines.push(`- ${k}: ${v}`);
195
+ else tierLines.push(`- ${k}: ⚠️ (empty — set before DISTILLATION, or worker_llm_call tools will fail)`);
196
+ }
197
+ parts.push("### Worker LLM tiers (.env snapshot)\n" + tierLines.join("\n") +
198
+ "\n\nThese drive `worker_llm_call`, `workflow_run`, `document_parse` OCR, etc. Empty tiers don't block bootstrap — but DISTILLATION requires at least TIER1 to be live. Discuss with the developer user if any are missing.");
199
+ }
200
+
161
201
  if (this.exitCriteriaMet()) {
162
202
  parts.push("### Exit\nBootstrap requirements met. Proceed to EXTRACTION.");
163
203
  }
@@ -34,6 +34,14 @@ export class SkillAuthoringPipeline extends Pipeline {
34
34
  _scanSkills() {
35
35
  this.skillsAuthored = [];
36
36
  this.skillsWithScripts = [];
37
+ // D2: rule_ids that are covered by some authored skill — whether that
38
+ // skill is single-rule (rule_skills/R014/) or grouped
39
+ // (rule_skills/SK02/check_r002_r007.py). Populated by _walkForRuleIds
40
+ // below so the exit criterion counts DISTINCT rule coverage rather
41
+ // than skill-directory count, which over-counts when skills are
42
+ // grouped (session 6304673afaa0's rule_skills/ had 289 rules packed
43
+ // into 23 skill files).
44
+ this.ruleIdsCovered = new Set();
37
45
  const dir = path.join(this._workspace.cwd, "rule_skills");
38
46
  if (!fs.existsSync(dir)) return;
39
47
  for (const e of fs.readdirSync(dir, { withFileTypes: true })) {
@@ -46,19 +54,97 @@ export class SkillAuthoringPipeline extends Pipeline {
46
54
  if (fs.existsSync(scriptsDir) && fs.readdirSync(scriptsDir).length > 0) {
47
55
  this.skillsWithScripts.push(e.name);
48
56
  }
57
+ this._walkForRuleIds(skillPath);
49
58
  }
50
59
  }
51
60
 
61
+ /**
62
+ * D2: Find rule_ids referenced by any file under the skill directory.
63
+ * Recognizes three naming patterns from actual sessions:
64
+ * - Directory name matches a rule: rule_skills/R014/
65
+ * - Single-rule script: check_r014.py
66
+ * - Grouped script: check_r002_r007.py → covers R002 through R007
67
+ */
68
+ _walkForRuleIds(skillDir) {
69
+ const dirName = path.basename(skillDir);
70
+ const dirMatch = dirName.match(/^R0*(\d+)$/i);
71
+ if (dirMatch) this.ruleIdsCovered.add(`R${String(parseInt(dirMatch[1], 10)).padStart(3, "0")}`);
72
+
73
+ const walk = (d) => {
74
+ let entries;
75
+ try { entries = fs.readdirSync(d, { withFileTypes: true }); }
76
+ catch { return; }
77
+ for (const e of entries) {
78
+ if (e.name.startsWith(".")) continue;
79
+ const p = path.join(d, e.name);
80
+ if (e.isDirectory()) { walk(p); continue; }
81
+ // Per-rule: check_r014.py
82
+ const single = e.name.match(/check_r0*(\d+)\.py$/i);
83
+ if (single) {
84
+ this.ruleIdsCovered.add(`R${String(parseInt(single[1], 10)).padStart(3, "0")}`);
85
+ continue;
86
+ }
87
+ // Grouped: check_r002_r007.py, check_r002-r007.py, check_r59_r77.py
88
+ const grouped = e.name.match(/check_r0*(\d+)[_-]+r0*(\d+)\.py$/i);
89
+ if (grouped) {
90
+ const lo = parseInt(grouped[1], 10);
91
+ const hi = parseInt(grouped[2], 10);
92
+ for (let n = lo; n <= hi; n++) {
93
+ this.ruleIdsCovered.add(`R${String(n).padStart(3, "0")}`);
94
+ }
95
+ continue;
96
+ }
97
+ // Directory names that encode ranges: R078_R128/
98
+ // handled by caller passing skillDir
99
+ }
100
+ };
101
+ // Also handle dirs named like R078_R128/
102
+ const rangeDir = dirName.match(/^R0*(\d+)[_-]R0*(\d+)$/i);
103
+ if (rangeDir) {
104
+ const lo = parseInt(rangeDir[1], 10);
105
+ const hi = parseInt(rangeDir[2], 10);
106
+ for (let n = lo; n <= hi; n++) {
107
+ this.ruleIdsCovered.add(`R${String(n).padStart(3, "0")}`);
108
+ }
109
+ }
110
+ walk(skillDir);
111
+ }
112
+
52
113
  describeState() {
53
114
  this._scanWorkspace();
54
115
  const total = this.totalRules.length;
55
- const authored = this.skillsAuthored.length;
56
- const remaining = this.totalRules.filter((r) => !this.skillsAuthored.includes(r));
57
- const parts = ["## Phase: SKILL_AUTHORING\nWrite verification skills for each extracted rule. Skills are first-class deliverables — they may serve as the production solution when worker LLM workflows are insufficient. Follow Anthropic skill-creator format. This is BUILD mode."];
58
- parts.push(`### Progress\n- Rules: ${total}\n- Skills authored: ${authored}\n- Skills with scripts/: ${this.skillsWithScripts.length}${remaining.length > 0 ? `\n- Remaining: ${remaining.slice(0, 10).join(", ")}` : ""}`);
116
+ const covered = this.ruleIdsCovered.size;
117
+ const uncovered = this.totalRules.filter((r) => !this.ruleIdsCovered.has(r));
118
+ const parts = [
119
+ "## Phase: SKILL_AUTHORING\n" +
120
+ "Write verification skills for each extracted rule. Skills are first-class " +
121
+ "deliverables — they may serve as the production solution when worker LLM " +
122
+ "workflows are insufficient. Follow Anthropic skill-creator format. This is " +
123
+ "BUILD mode.\n\n" +
124
+ // D2: soft granularity nudge
125
+ "**Granularity preference:** 1 rule = 1 skill directory. Group rules into " +
126
+ "the same file ONLY when they share evidence and fail together (e.g. " +
127
+ "siblings from the same required-fields table). When grouping, name the " +
128
+ "file with the range: `check_r002_r007.py`. Downstream consumers " +
129
+ "(workflow-run, dashboards) count rule coverage by parsing these names, " +
130
+ "so the file-naming matters.\n\n" +
131
+ "**Do not write to rules/catalog.json via sandbox_exec.** Use the " +
132
+ "`rule_catalog` tool for any catalog edits — sandbox_exec bypasses the " +
133
+ "workspace file lock and races with parallel workers."
134
+ ];
135
+ parts.push(
136
+ `### Progress (rule-id coverage, D2)\n` +
137
+ `- Total rules in catalog: ${total}\n` +
138
+ `- Rule ids covered by some skill: ${covered}\n` +
139
+ `- Skill directories authored: ${this.skillsAuthored.length}\n` +
140
+ `- Skills with scripts/: ${this.skillsWithScripts.length}` +
141
+ (uncovered.length > 0
142
+ ? `\n- Missing coverage (${uncovered.length}): ${uncovered.slice(0, 15).join(", ")}${uncovered.length > 15 ? "…" : ""}`
143
+ : ""),
144
+ );
59
145
 
60
146
  if (this.exitCriteriaMet()) {
61
- parts.push("### Exit\nAll rules have skills. Proceed to SKILL_TESTING.");
147
+ parts.push("### Exit\nAll rule ids are covered by some skill. Proceed to SKILL_TESTING.");
62
148
  }
63
149
  return parts.join("\n\n");
64
150
  }
@@ -75,7 +161,15 @@ export class SkillAuthoringPipeline extends Pipeline {
75
161
 
76
162
  exitCriteriaMet() {
77
163
  if (!this.totalRules.length) return false;
78
- return this.skillsAuthored.length >= this.totalRules.length && this.skillsWithScripts.length >= this.skillsAuthored.length * 0.5;
164
+ // D2: exit requires distinct rule-id coverage, not skill-dir count.
165
+ // Original heuristic (skillsAuthored >= totalRules) passed the phase
166
+ // even when KC grouped many rules into one file — a false signal when
167
+ // the user wants per-rule verification. Now every rule id in the
168
+ // catalog must appear in some skill name. The scripts/ heuristic is
169
+ // preserved as a secondary gate on skill depth.
170
+ const allCovered = this.totalRules.every((r) => this.ruleIdsCovered.has(r));
171
+ if (!allCovered) return false;
172
+ return this.skillsWithScripts.length >= Math.max(1, this.skillsAuthored.length * 0.5);
79
173
  }
80
174
 
81
175
  exportState() {
@@ -5,6 +5,46 @@ import { fileURLToPath } from "node:url";
5
5
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
6
6
  const BUNDLED_SKILLS_DIR = path.resolve(__dirname, "../../template/skills");
7
7
 
8
+ // D3b: Phase-relevance map. Skills not listed here are always visible
9
+ // (safe default for future additions). Skills listed here are only
10
+ // included in the context index for the named phases — unrelated
11
+ // phases save the system-prompt budget. This is a soft filter: the
12
+ // agent can still `workspace_file` read any skill on-demand.
13
+ //
14
+ // Keep this close to the skill set it describes — one hardcoded table
15
+ // per release, not spread across files. When adding a skill to
16
+ // template/skills/, add it here if phase-specific, or leave it out
17
+ // to default to always-visible.
18
const PHASE_RELEVANT_SKILLS = {
  "bootstrap-workspace": ["bootstrap"],
  "rule-extraction": ["bootstrap", "extraction"],
  "rule-graph": ["extraction", "skill_authoring"],
  "task-decomposition": ["extraction", "skill_authoring", "distillation"],
  "skill-authoring": ["skill_authoring", "skill_testing"],
  "skill-to-workflow": ["distillation"],
  "evolution-loop": ["skill_testing", "distillation", "production_qc"],
  "version-control": ["bootstrap", "extraction", "skill_authoring", "skill_testing", "distillation", "production_qc", "finalization"],
  "quality-control": ["production_qc", "finalization"],
  "confidence-system": ["distillation", "production_qc"],
  "dashboard-reporting": ["production_qc", "finalization"],
  "cross-document-verification": ["production_qc"],
  "corner-case-management": ["skill_testing", "distillation", "production_qc"],
  "data-sensibility": ["extraction", "skill_authoring"],
  "entity-extraction": ["skill_authoring", "distillation"],
  "document-parsing": ["bootstrap", "extraction", "skill_authoring"],
  "document-chunking": ["bootstrap", "extraction"],
  "tree-processing": ["skill_authoring", "skill_testing"],
  "compliance-judgment": ["skill_authoring", "skill_testing", "production_qc"],
  "skill-creator": ["skill_authoring"],
};

/**
 * Decide whether a skill should be listed for the given phase.
 *
 * @param {string} skillName - Skill name as it appears in the index.
 * @param {string} [phase] - Current phase value (e.g. "extraction").
 * @returns {boolean} true when the skill is not in PHASE_RELEVANT_SKILLS,
 *   when no phase is given, or when the phase is listed for the skill.
 */
function isSkillRelevantToPhase(skillName, phase) {
  // Own-property lookup only: a skill named "constructor", "toString" or
  // "__proto__" must be treated as unknown, not resolved through
  // Object.prototype (which would yield a non-array and crash on .includes).
  if (!Object.prototype.hasOwnProperty.call(PHASE_RELEVANT_SKILLS, skillName)) {
    return true; // unknown skill → always visible
  }
  if (!phase) return true; // caller didn't pass phase → always visible
  return PHASE_RELEVANT_SKILLS[skillName].includes(phase);
}
47
+
8
48
  /**
9
49
  * Discover and index meta skills from template/skills/.
10
50
  * Follows Claude Code's pattern: skills are NOT dumped into the system prompt.
@@ -79,15 +119,25 @@ export class SkillLoader {
79
119
  /**
80
120
  * Format the skill index for injection into agent context.
81
121
  * Brief listing — agent reads full content on demand.
122
+ *
123
+ * D3b: when `phase` is provided, filter out skills that aren't relevant
124
+ * to the phase (per PHASE_RELEVANT_SKILLS). Unknown skills stay visible
125
+ * so new additions to template/skills/ aren't accidentally hidden.
126
+ *
127
+ * @param {string} [phase] - Current engine phase for filtering
82
128
  * @returns {string}
83
129
  */
84
- formatForContext() {
130
+ formatForContext(phase) {
85
131
  const index = this.getIndex();
86
132
  if (index.length === 0) return "";
87
133
 
88
- const metaMeta = index.filter((s) => s.category === "meta-meta");
89
- const meta = index.filter((s) => s.category === "meta");
90
- const other = index.filter((s) => s.category !== "meta-meta" && s.category !== "meta");
134
+ const visible = phase
135
+ ? index.filter((s) => isSkillRelevantToPhase(s.name, phase))
136
+ : index;
137
+
138
+ const metaMeta = visible.filter((s) => s.category === "meta-meta");
139
+ const meta = visible.filter((s) => s.category === "meta");
140
+ const other = visible.filter((s) => s.category !== "meta-meta" && s.category !== "meta");
91
141
 
92
142
  const lines = ["## Available Methodology Skills",
93
143
  "Read full skill content from the skills/ directory when needed.\n"];